maggma-0.70.0/.coveragerc

[run]
omit = *test*, rabbitmq*

maggma-0.70.0/.github/dependabot.yml

version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

maggma-0.70.0/.github/pull_request_template.md

## Summary

Major changes:

- feature 1: ...
- fix 1: ...

## Todos

If this is work in progress, what else needs to be done?

- feature 2: ...
- fix 2:

## Checklist

- [ ] Google format doc strings added.
- [ ] Code linted with `ruff`. (For guidance in fixing rule violations, see [rule list](https://beta.ruff.rs/docs/rules/))
- [ ] Type annotations included. Check with `mypy`.
- [ ] Tests added for new features/fixes.
- [ ] I have run the tests locally and they passed.

Tip: Install `pre-commit` hooks to auto-check types and linting before every commit:

```sh
pip install -U pre-commit
pre-commit install
```

maggma-0.70.0/.github/release.yml

changelog:
  exclude:
    authors: [dependabot, github-actions, pre-commit-ci]
  categories:
    - title: 🎉 New Features
      labels: [feature]
    - title: 🐛 Bug Fixes
      labels: [fix]
    - title: 🛠 Enhancements
      labels: [enhancement, DX, UX]
    - title: 📖 Documentation
      labels: [docs]
    - title: 🧹 House-Keeping
      labels: [housekeeping]
    - title: 🚀 Performance
      labels: [performance]
    - title: 💡 Refactoring
      labels: [refactor]
    - title: 🧪 Tests
      labels: [tests]
    - title: 💥 Breaking Changes
      labels: [breaking]
    - title: 🔒 Security Fixes
      labels: [security]
    - title: 🏥 Package Health
      labels: [pkg]
    - title: 🤷‍♂️ Other Changes
      labels: ["*"]

maggma-0.70.0/.github/workflows/codeql.yml

name: "CodeQL"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: "11 19 * * 6"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write
    strategy:
      fail-fast: false
      matrix:
        language: [ python ]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v2
        with:
          languages: ${{ matrix.language }}
          queries: +security-and-quality
      - name: Autobuild
        uses: github/codeql-action/autobuild@v2
      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v2
        with:
          category: "/language:${{ matrix.language }}"

maggma-0.70.0/.github/workflows/post-process.yml

name: Post-process

on:
  workflow_run:
    branches:
      - main
    types:
      - completed
    workflows:
      # List all required workflow names here.
      - 'testing'

jobs:
  auto-gen-release:
    runs-on: ubuntu-latest
    env:
      GITHUB_TOKEN: ${{ secrets.SEMVER_BUMP_TOKEN }}
    steps:
      - # It is often a desired behavior to merge only when a workflow execution
        # succeeds. This can be changed as needed.
        if: ${{ github.event.workflow_run.conclusion == 'success' }}
        uses: rymndhng/release-on-push-action@v0.25.0
        with:
          bump_version_scheme: norelease

maggma-0.70.0/.github/workflows/release.yml

name: release

on:
  release:
    types: [published]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools setuptools_scm wheel build
      - name: Build packages
        run: |
          python -m build --sdist --wheel
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@master
        with:
          user: __token__
          password: ${{ secrets.PYPY_API_TOKEN }}

  docs:
    runs-on: ubuntu-latest
    needs:
      - deploy
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/ubuntu-latest_py3.10_extras.txt
          pip install -e .
      - name: Generate changelog
        uses: charmixer/auto-changelog-action@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          exclude_labels: dependencies
      - name: Commit files
        run: |
          git config --local user.email "feedback@materialsproject.org"
          git config --local user.name "materialsproject"
          mv CHANGELOG.md docs/
          git add docs/CHANGELOG.md && git commit -m 'Updated CHANGELOG.md'
      - name: Push changes
        uses: ad-m/github-push-action@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
      - name: Build
        run: mkdocs build
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v4.0.0
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./site

maggma-0.70.0/.github/workflows/testing.yml

name: testing

on:
  push:
    branches:
      - main
    paths-ignore:
      - 'docs/CHANGELOG.md'
  pull_request:
    branches:
      - main

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 1
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.11
          cache: pip
      - name: Run pre-commit
        run: |
          pip install pre-commit
          pre-commit run --all-files

  test:
    needs: lint
    services:
      local_mongodb:
        image: mongo:4.0
        ports:
          - 27017:27017
      azurite:
        image: mcr.microsoft.com/azure-storage/azurite
        ports:
          - 10000:10000
    strategy:
      max-parallel: 6
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/${{ matrix.os }}_py${{ matrix.python-version }}_extras.txt
      - name: Test with pytest
        env:
          CONTINUOUS_INTEGRATION: True
          MONGODB_SRV_URI: ${{ secrets.MONGODB_SRV_URI }}
        run: |
          pip install -e .
          pytest --cov=maggma --cov-report=xml
      - uses: codecov/codecov-action@v4.6.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml

  docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[docs]
      - name: Build
        run: mkdocs build

maggma-0.70.0/.github/workflows/upgrade-dependencies.yml

# https://www.oddbird.net/2022/06/01/dependabot-single-pull-request/
# https://github.com/materialsproject/MPContribs/blob/master/.github/workflows/upgrade-dependencies.yml
name: upgrade dependencies

on:
  workflow_dispatch: # Allow running on-demand
  schedule:
    # Runs every Monday at 8:00 UTC (4:00 Eastern)
    - cron: '0 8 * * 1'

jobs:
  upgrade:
    name: ${{ matrix.package }} (${{ matrix.os }}/py${{ matrix.python-version }})
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: ['ubuntu-latest', 'macos-latest']
        package: ["."]
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
      - name: Upgrade Python dependencies
        shell: bash
        run: |
          python${{ matrix.python-version }} -m pip install --upgrade pip pip-tools
          cd ${{ matrix.package }}
          python${{ matrix.python-version }} -m piptools compile -q --upgrade --resolver=backtracking -o requirements/${{ matrix.os }}_py${{ matrix.python-version }}.txt
          python${{ matrix.python-version }} -m piptools compile -q --upgrade --resolver=backtracking --all-extras -o requirements/${{ matrix.os }}_py${{ matrix.python-version }}_extras.txt
      - name: Detect changes
        id: changes
        shell: bash
        run: |
          #git diff-index HEAD ${{ matrix.package }}/requirements/${{ matrix.os }}_py${{ matrix.python-version }}*.txt | awk '{print $4}' | sort -u
          #sha1=$(git diff-index HEAD ${{ matrix.package }}/requirements/${{ matrix.os }}_py${{ matrix.python-version }}*.txt | awk '{print $4}' | sort -u | head -n1)
          #[[ $sha1 == "0000000000000000000000000000000000000000" ]] && git update-index --really-refresh ${{ matrix.package }}/requirements/${{ matrix.os }}_py${{ matrix.python-version }}*.txt
          echo "count=$(git diff-index HEAD ${{ matrix.package }}/requirements/${{ matrix.os }}_py${{ matrix.python-version }}*.txt | wc -l | xargs)" >> $GITHUB_OUTPUT
          echo "files=$(git ls-files --exclude-standard --others ${{ matrix.package }}/requirements/${{ matrix.os }}_py${{ matrix.python-version }}*.txt | wc -l | xargs)" >> $GITHUB_OUTPUT
      - name: commit & push changes
        if: steps.changes.outputs.count > 0 || steps.changes.outputs.files > 0
        shell: bash
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
          git add ${{ matrix.package }}/requirements
          git commit -m "update dependencies for ${{ matrix.package }} (${{ matrix.os }}/py${{ matrix.python-version }})"
          git push -f origin ${{ github.ref_name }}:auto-dependency-upgrades-${{ matrix.package }}-${{ matrix.os }}-py${{ matrix.python-version }}

  pull_request:
    name: Merge all branches and open PR
    runs-on: ubuntu-latest
    needs: upgrade
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: detect auto-upgrade-dependency branches
        id: changes
        run: echo "count=$(git branch -r | grep auto-dependency-upgrades- | wc -l | xargs)" >> $GITHUB_OUTPUT
      - name: merge all auto-dependency-upgrades branches
        if: steps.changes.outputs.count > 0
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
          git checkout -b auto-dependency-upgrades
          git branch -r | grep auto-dependency-upgrades- | xargs -I {} git merge {}
          git rebase ${GITHUB_REF##*/}
          git push -f origin auto-dependency-upgrades
          git branch -r | grep auto-dependency-upgrades- | cut -d/ -f2 | xargs -I {} git push origin :{}
      - name: Open pull request if needed
        if: steps.changes.outputs.count > 0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # Only open a PR if the branch is not attached to an existing one
        # note that this auto-created PR will not trigger the testing workflow, which
        # is an intentional limitation imposed by GitHub. See
        # https://github.com/peter-evans/create-pull-request/blob/main/docs/concepts-guidelines.md#triggering-further-workflow-runs
        # The simplest workaround is to close and immediately re-open the Auto PR
        run: |
          PR=$(gh pr list --head auto-dependency-upgrades --json number -q '.[0].number')
          if [ -z $PR ]; then
            gh pr create \
              --head auto-dependency-upgrades \
              --title "Automated dependency upgrades" \
              --body "Full log: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          else
            echo "Pull request already exists, won't create a new one."
          fi

maggma-0.70.0/.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
source/apidoc/maggma.*
source/apidoc/modules.rst

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# eclipse
.project
.pydevproject

*~
.idea
.DS_Store
.vscode
.pytest_cache

maggma-0.70.0/.pre-commit-config.yaml

default_stages: [commit]
default_install_hook_types: [pre-commit, commit-msg]

ci:
  autoupdate_schedule: monthly
  # skip: [mypy]
  autofix_commit_msg: pre-commit auto-fixes
  autoupdate_commit_msg: pre-commit autoupdate

repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.4.10
    hooks:
      - id: ruff
        args: [--fix, --show-fixes, --ignore, D, --extend-select, D411]
  - repo: https://github.com/psf/black
    rev: 24.4.2
    hooks:
      - id: black
  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell
        stages: [commit, commit-msg]
        exclude_types: [html]
        additional_dependencies: [tomli] # needed to read pyproject.toml below py3.11
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-case-conflict
      - id: check-symlinks
      - id: destroyed-symlinks
      - id: end-of-file-fixer
      - id: mixed-line-ending
      - id: trailing-whitespace
maggma-0.70.0/CODE_OF_CONDUCT.md

# Contributor Code of Conduct

As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.

We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality.

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery
* Personal attacks
* Trolling or insulting/derogatory comments
* Public or private harassment
* Publishing other's private information, such as physical or electronic addresses, without explicit permission
* Other unethical or unprofessional conduct

Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.

By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team.

This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community.

Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting a project maintainer at conduct@materialsproject.org. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. Maintainers are obligated to maintain confidentiality with regard to the reporter of an incident to the extent possible by law and institutional policy.

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.3.0, available at https://www.contributor-covenant.org/version/1/3/0/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

maggma-0.70.0/LICENSE

maggma Copyright (c) 2017, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

(1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

(2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

(3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory or its contributors, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, use, modify, prepare derivative works, incorporate into other computer software, distribute, and sublicense such enhancements or derivative works thereof, in binary and source code form.

maggma-0.70.0/README.md

# ![Maggma](docs/logo_w_text.svg)

[![Static Badge](https://img.shields.io/badge/documentation-blue?logo=github)](https://materialsproject.github.io/maggma)
[![testing](https://github.com/materialsproject/maggma/workflows/testing/badge.svg)](https://github.com/materialsproject/maggma/actions?query=workflow%3Atesting)
[![codecov](https://codecov.io/gh/materialsproject/maggma/branch/main/graph/badge.svg)](https://codecov.io/gh/materialsproject/maggma)
[![python](https://img.shields.io/badge/Python-3.9+-blue.svg?logo=python&logoColor=white)]()

## What is Maggma

Maggma is a framework to build scientific data processing pipelines from data stored in a variety of formats -- databases, Azure Blobs, files on disk, etc., all the way to a REST API. The rest of this README contains a brief, high-level overview of what `maggma` can do. For more, please refer to [the documentation](https://materialsproject.github.io/maggma).

## Installation from PyPI

Maggma is published on the [Python Package Index](https://pypi.org/project/maggma/). The preferred tool for installing packages from *PyPI* is **pip**. This tool is provided with all modern versions of Python.

Open your terminal and run the following command.

``` shell
pip install --upgrade maggma
```

## Basic Concepts

`maggma`'s core classes -- [`Store`](#store) and [`Builder`](#builder) -- provide building blocks for modular data pipelines. Data resides in one or more `Store` and is processed by a `Builder`. The results of the processing are saved in another `Store`, and so on:

```mermaid
flowchart LR
    s1(Store 1) --Builder 1--> s2(Store 2) --Builder 2--> s3(Store 3)
    s2 -- Builder 3-->s4(Store 4)
```

### Store

A major challenge in building scalable data pipelines is dealing with all the different types of data sources out there.
Maggma's `Store` class provides a consistent, unified interface for querying data from arbitrary data sources. It was originally built around MongoDB, so its interface closely resembles `PyMongo` syntax. However, Maggma makes it possible to use that same syntax to query other types of databases, such as Amazon S3, GridFS, or files on disk, [and many others](https://materialsproject.github.io/maggma/getting_started/stores/#list-of-stores).

Stores implement methods to `connect`, `query`, find `distinct` values, `groupby` fields, `update` documents, and `remove` documents. The example below demonstrates inserting 4 documents (python `dicts`) into a `MongoStore` with `update`, then accessing the data using `count`, `query`, and `distinct`.

```python
>>> from maggma.stores import MongoStore
>>> turtles = [{"name": "Leonardo", "color": "blue", "tool": "sword"},
               {"name": "Donatello", "color": "purple", "tool": "staff"},
               {"name": "Michelangelo", "color": "orange", "tool": "nunchuks"},
               {"name": "Raphael", "color": "red", "tool": "sai"}
              ]
>>> store = MongoStore(database="my_db_name",
                       collection_name="my_collection_name",
                       username="my_username",
                       password="my_password",
                       host="my_hostname",
                       port=27017,
                       key="name",
                      )
>>> with store:
        store.update(turtles)
>>> store.count()
4
>>> store.query_one({})
{'_id': ObjectId('66746d29a78e8431daa3463a'), 'name': 'Leonardo', 'color': 'blue', 'tool': 'sword'}
>>> store.distinct('color')
['purple', 'orange', 'blue', 'red']
```

### Builder

Builders represent a data processing step, analogous to an extract-transform-load (ETL) operation in a data warehouse model. Much like `Store` provides a consistent interface for accessing data, the `Builder` classes provide a consistent interface for transforming it.

`Builder` transformations are each broken into 3 phases: `get_items`, `process_item`, and `update_targets`:

1. `get_items`: Retrieve items from the source Store(s) for processing by the next phase
2. `process_item`: Manipulate the input item and create an output document that is sent to the next phase for storage.
3. `update_targets`: Add the processed item to the target Store(s).

Both `get_items` and `update_targets` can perform IO (input/output) to the data stores. `process_item` is expected to not perform any IO so that it can be parallelized by Maggma. Builders can be chained together into an array and then saved as a JSON file to be run on a production system. A minimal, illustrative builder sketch is included at the end of this README.

## Origin and Maintainers

Maggma has been developed and is maintained by the [Materials Project](https://materialsproject.org/) team at Lawrence Berkeley National Laboratory and the [Materials Project Software Foundation](https://github.com/materialsproject/foundation).

Maggma is written in [Python](http://docs.python-guide.org/en/latest/) and supports Python 3.9+.
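
## Example: a minimal Builder

To make the three phases above concrete, here is a minimal, illustrative sketch (not taken from the maggma source) of a builder that copies documents from one store to another while adding a computed field. The class name, the store variables, and the `name_length` field are hypothetical; the method layout follows the three-phase `Builder` interface described above, with `Builder` and `Store` assumed to be importable from `maggma.core`.

```python
from maggma.core import Builder, Store


class AddNameLengthBuilder(Builder):
    """Toy builder: copy docs from `source` to `target`, adding a computed field."""

    def __init__(self, source: Store, target: Store, **kwargs):
        self.source = source
        self.target = target
        super().__init__(sources=[source], targets=[target], **kwargs)

    def get_items(self):
        # Phase 1: read items from the source store (IO happens here)
        return self.source.query()

    def process_item(self, item):
        # Phase 2: pure transformation with no IO, so it can be parallelized
        return {"name": item["name"], "name_length": len(item["name"])}

    def update_targets(self, items):
        # Phase 3: write the processed chunk of items to the target store
        self.target.update(items, key="name")
```

With connected stores, such a builder could be run directly (for example, `AddNameLengthBuilder(source_store, target_store).run()`) or handed off to maggma's runner for parallel execution; see the documentation linked above for the supported workflows.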
maggma-0.70.0/docs/CHANGELOG.md

# Changelog

## [v0.69.4](https://github.com/materialsproject/maggma/tree/v0.69.4) (2024-09-29)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.69.3...v0.69.4)

**Closed issues:**

- \[Feature Request\]: Support numpy 2.0 [\#990](https://github.com/materialsproject/maggma/issues/990)

**Merged pull requests:**

- clarify state of open data stores [\#997](https://github.com/materialsproject/maggma/pull/997) ([kbuma](https://github.com/kbuma))
- add python 3.12 to CI tests [\#992](https://github.com/materialsproject/maggma/pull/992) ([rkingsbury](https://github.com/rkingsbury))

## [v0.69.3](https://github.com/materialsproject/maggma/tree/v0.69.3) (2024-08-23)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.69.2...v0.69.3)

## [v0.69.2](https://github.com/materialsproject/maggma/tree/v0.69.2) (2024-08-16)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.69.1...v0.69.2)

**Merged pull requests:**

- capability to configure query on request [\#985](https://github.com/materialsproject/maggma/pull/985) ([yang-ruoxi](https://github.com/yang-ruoxi))
- Automated dependency upgrades [\#983](https://github.com/materialsproject/maggma/pull/983) ([github-actions[bot]](https://github.com/apps/github-actions))

## [v0.69.1](https://github.com/materialsproject/maggma/tree/v0.69.1) (2024-07-24)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.69.0...v0.69.1)

## [v0.69.0](https://github.com/materialsproject/maggma/tree/v0.69.0) (2024-07-01)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.6...v0.69.0)

**Closed issues:**

- \[Feature Request\]: Leverage optional dependency groups to reduce dependency count [\#928](https://github.com/materialsproject/maggma/issues/928)
- Update README/docs to better reflect the purpose of Maggma [\#886](https://github.com/materialsproject/maggma/issues/886)

**Merged pull requests:**

- Store Documentation update [\#976](https://github.com/materialsproject/maggma/pull/976) ([rkingsbury](https://github.com/rkingsbury))
- Add content to README; documentation fixups [\#969](https://github.com/materialsproject/maggma/pull/969) ([rkingsbury](https://github.com/rkingsbury))
- Automated dependency upgrades [\#965](https://github.com/materialsproject/maggma/pull/965) ([github-actions[bot]](https://github.com/apps/github-actions))

## [v0.68.6](https://github.com/materialsproject/maggma/tree/v0.68.6) (2024-06-20)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.5...v0.68.6)

**Merged pull requests:**

- move API to optional dependency group; move OpenData to default installation [\#970](https://github.com/materialsproject/maggma/pull/970) ([rkingsbury](https://github.com/rkingsbury))

## [v0.68.5](https://github.com/materialsproject/maggma/tree/v0.68.5) (2024-06-20)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.4...v0.68.5)

**Merged pull requests:**

- mv mongogrant to optional dependency group [\#968](https://github.com/materialsproject/maggma/pull/968) ([rkingsbury](https://github.com/rkingsbury))

## [v0.68.4](https://github.com/materialsproject/maggma/tree/v0.68.4) (2024-06-11)

[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.3...v0.68.4)

##
[v0.68.3](https://github.com/materialsproject/maggma/tree/v0.68.3) (2024-06-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.2...v0.68.3) **Merged pull requests:** - Bugfix in sorting query operator [\#964](https://github.com/materialsproject/maggma/pull/964) ([munrojm](https://github.com/munrojm)) ## [v0.68.2](https://github.com/materialsproject/maggma/tree/v0.68.2) (2024-06-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.1...v0.68.2) ## [v0.68.1](https://github.com/materialsproject/maggma/tree/v0.68.1) (2024-05-30) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.68.0...v0.68.1) **Merged pull requests:** - Handle store error during finalize [\#958](https://github.com/materialsproject/maggma/pull/958) ([jmmshn](https://github.com/jmmshn)) ## [v0.68.0](https://github.com/materialsproject/maggma/tree/v0.68.0) (2024-05-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.67.0...v0.68.0) **Breaking changes:** - drop python 3.8 support [\#951](https://github.com/materialsproject/maggma/pull/951) ([rkingsbury](https://github.com/rkingsbury)) **Implemented enhancements:** - \[Feature Request\]: pass keyword arguments to zopen to accommodate non english platforms [\#932](https://github.com/materialsproject/maggma/issues/932) **Merged pull requests:** - Add support for python 3.12 to CI [\#954](https://github.com/materialsproject/maggma/pull/954) ([rkingsbury](https://github.com/rkingsbury)) - merge setup.py into pyproject.toml [\#952](https://github.com/materialsproject/maggma/pull/952) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.67.0](https://github.com/materialsproject/maggma/tree/v0.67.0) (2024-05-13) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.66.0...v0.67.0) **Implemented enhancements:** - Add character encoding kwarg to JSONStore and FileStore [\#949](https://github.com/materialsproject/maggma/pull/949) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.66.0](https://github.com/materialsproject/maggma/tree/v0.66.0) (2024-04-30) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.65.0...v0.66.0) **Merged pull requests:** - Add config option to sort query op [\#944](https://github.com/materialsproject/maggma/pull/944) ([munrojm](https://github.com/munrojm)) ## [v0.65.0](https://github.com/materialsproject/maggma/tree/v0.65.0) (2024-04-18) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.64.1...v0.65.0) **Closed issues:** - \[Feature Request\]: support ruamel.yaml 0.18+ [\#938](https://github.com/materialsproject/maggma/issues/938) **Merged pull requests:** - Adding store support for tasks stored in open data [\#943](https://github.com/materialsproject/maggma/pull/943) ([kbuma](https://github.com/kbuma)) - allow HEAD method for `/heartbeat` [\#874](https://github.com/materialsproject/maggma/pull/874) ([tschaume](https://github.com/tschaume)) ## [v0.64.1](https://github.com/materialsproject/maggma/tree/v0.64.1) (2024-04-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.64.0...v0.64.1) ## [v0.64.0](https://github.com/materialsproject/maggma/tree/v0.64.0) (2024-03-17) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.63.4...v0.64.0) **Implemented enhancements:** - Enable `recursive_msonable` in `jsanitize` calls [\#930](https://github.com/materialsproject/maggma/pull/930) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) ## 
[v0.63.4](https://github.com/materialsproject/maggma/tree/v0.63.4) (2024-02-29) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.63.3...v0.63.4) **Merged pull requests:** - write all NaN and NaT Dataframe created values as null [\#929](https://github.com/materialsproject/maggma/pull/929) ([kbuma](https://github.com/kbuma)) ## [v0.63.3](https://github.com/materialsproject/maggma/tree/v0.63.3) (2024-02-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.63.2...v0.63.3) **Implemented enhancements:** - Tweak docstrings to fix rendered docs [\#923](https://github.com/materialsproject/maggma/pull/923) ([ml-evs](https://github.com/ml-evs)) ## [v0.63.2](https://github.com/materialsproject/maggma/tree/v0.63.2) (2024-02-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.63.1...v0.63.2) **Merged pull requests:** - enables using more efficient queries for count, distinct and newer\_in [\#921](https://github.com/materialsproject/maggma/pull/921) ([kbuma](https://github.com/kbuma)) ## [v0.63.1](https://github.com/materialsproject/maggma/tree/v0.63.1) (2024-02-14) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.63.0...v0.63.1) **Merged pull requests:** - fix open data store connect and close and address future warnings for pandas [\#920](https://github.com/materialsproject/maggma/pull/920) ([kbuma](https://github.com/kbuma)) ## [v0.63.0](https://github.com/materialsproject/maggma/tree/v0.63.0) (2024-02-13) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.62.1...v0.63.0) **Merged pull requests:** - open data refactor for integration with builders [\#919](https://github.com/materialsproject/maggma/pull/919) ([kbuma](https://github.com/kbuma)) ## [v0.62.1](https://github.com/materialsproject/maggma/tree/v0.62.1) (2024-02-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.62.0...v0.62.1) **Merged pull requests:** - chunking for json normalization [\#914](https://github.com/materialsproject/maggma/pull/914) ([kbuma](https://github.com/kbuma)) ## [v0.62.0](https://github.com/materialsproject/maggma/tree/v0.62.0) (2024-02-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.61.1...v0.62.0) **Merged pull requests:** - updating for open data format change [\#911](https://github.com/materialsproject/maggma/pull/911) ([kbuma](https://github.com/kbuma)) ## [v0.61.1](https://github.com/materialsproject/maggma/tree/v0.61.1) (2024-01-30) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.61.0...v0.61.1) **Merged pull requests:** - Make get by key default false [\#910](https://github.com/materialsproject/maggma/pull/910) ([munrojm](https://github.com/munrojm)) ## [v0.61.0](https://github.com/materialsproject/maggma/tree/v0.61.0) (2024-01-19) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.60.2...v0.61.0) **Closed issues:** - `DeprecationWarning` associated with `pkg_resources` [\#903](https://github.com/materialsproject/maggma/issues/903) **Merged pull requests:** - creating PandasMemoryStore for use by OpenDataStore [\#908](https://github.com/materialsproject/maggma/pull/908) ([kbuma](https://github.com/kbuma)) ## [v0.60.2](https://github.com/materialsproject/maggma/tree/v0.60.2) (2024-01-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.60.1...v0.60.2) **Merged pull requests:** - rm deprecated pkg\_resources [\#905](https://github.com/materialsproject/maggma/pull/905) 
([rkingsbury](https://github.com/rkingsbury)) ## [v0.60.1](https://github.com/materialsproject/maggma/tree/v0.60.1) (2024-01-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.60.0...v0.60.1) **Implemented enhancements:** - Patch s3 kwarg [\#900](https://github.com/materialsproject/maggma/pull/900) ([jmmshn](https://github.com/jmmshn)) **Merged pull requests:** - special casing for thermo, xas and synth\_descriptions collections in OpenData [\#904](https://github.com/materialsproject/maggma/pull/904) ([kbuma](https://github.com/kbuma)) - linting fixes [\#901](https://github.com/materialsproject/maggma/pull/901) ([jmmshn](https://github.com/jmmshn)) ## [v0.60.0](https://github.com/materialsproject/maggma/tree/v0.60.0) (2023-12-15) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.59.0...v0.60.0) **Merged pull requests:** - fixing OpenDataStore to pickle correctly [\#897](https://github.com/materialsproject/maggma/pull/897) ([kbuma](https://github.com/kbuma)) ## [v0.59.0](https://github.com/materialsproject/maggma/tree/v0.59.0) (2023-12-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.58.0...v0.59.0) **Merged pull requests:** - Enhancement/open data store [\#893](https://github.com/materialsproject/maggma/pull/893) ([kbuma](https://github.com/kbuma)) ## [v0.58.0](https://github.com/materialsproject/maggma/tree/v0.58.0) (2023-11-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.10...v0.58.0) **Implemented enhancements:** - SSH tunnel support for S3Store [\#882](https://github.com/materialsproject/maggma/pull/882) ([mjwen](https://github.com/mjwen)) **Merged pull requests:** - update package metadata in pyproject.toml [\#892](https://github.com/materialsproject/maggma/pull/892) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.57.10](https://github.com/materialsproject/maggma/tree/v0.57.10) (2023-11-17) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.9...v0.57.10) **Merged pull requests:** - Remove key from sorting by default [\#890](https://github.com/materialsproject/maggma/pull/890) ([munrojm](https://github.com/munrojm)) ## [v0.57.9](https://github.com/materialsproject/maggma/tree/v0.57.9) (2023-11-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.8...v0.57.9) **Merged pull requests:** - Remove hint in count for S3Store [\#888](https://github.com/materialsproject/maggma/pull/888) ([munrojm](https://github.com/munrojm)) - Add missing `MontyStore` to list of stores [\#887](https://github.com/materialsproject/maggma/pull/887) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) ## [v0.57.8](https://github.com/materialsproject/maggma/tree/v0.57.8) (2023-11-09) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.7...v0.57.8) **Merged pull requests:** - Fix aggregation pipeline kwargs [\#884](https://github.com/materialsproject/maggma/pull/884) ([munrojm](https://github.com/munrojm)) ## [v0.57.7](https://github.com/materialsproject/maggma/tree/v0.57.7) (2023-11-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.6...v0.57.7) **Merged pull requests:** - Update hint\_scheme [\#883](https://github.com/materialsproject/maggma/pull/883) ([munrojm](https://github.com/munrojm)) ## [v0.57.6](https://github.com/materialsproject/maggma/tree/v0.57.6) (2023-11-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.5...v0.57.6) **Merged pull requests:** - Ensure sort stage is after 
match in agg pipeline [\#881](https://github.com/materialsproject/maggma/pull/881) ([munrojm](https://github.com/munrojm)) ## [v0.57.5](https://github.com/materialsproject/maggma/tree/v0.57.5) (2023-11-04) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.4...v0.57.5) **Implemented enhancements:** - Store.connect: fix force\_reset kwarg implementations [\#879](https://github.com/materialsproject/maggma/pull/879) ([rkingsbury](https://github.com/rkingsbury)) **Merged pull requests:** - chore: fix typos [\#877](https://github.com/materialsproject/maggma/pull/877) ([e-kwsm](https://github.com/e-kwsm)) - Automated dependency upgrades [\#875](https://github.com/materialsproject/maggma/pull/875) ([github-actions[bot]](https://github.com/apps/github-actions)) ## [v0.57.4](https://github.com/materialsproject/maggma/tree/v0.57.4) (2023-10-13) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.3...v0.57.4) **Merged pull requests:** - Fix header processing with enabled validation [\#871](https://github.com/materialsproject/maggma/pull/871) ([munrojm](https://github.com/munrojm)) ## [v0.57.3](https://github.com/materialsproject/maggma/tree/v0.57.3) (2023-10-12) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.2...v0.57.3) **Merged pull requests:** - Ensure header processor alters the correct object [\#870](https://github.com/materialsproject/maggma/pull/870) ([munrojm](https://github.com/munrojm)) ## [v0.57.2](https://github.com/materialsproject/maggma/tree/v0.57.2) (2023-10-09) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.1...v0.57.2) **Closed issues:** - \[Feature Request\]: Is there a specific reason why pyzmq is fixed to 24.0.1 rather than supporting more recent versions ? 
[\#867](https://github.com/materialsproject/maggma/issues/867) **Merged pull requests:** - Remove generic model reference [\#869](https://github.com/materialsproject/maggma/pull/869) ([munrojm](https://github.com/munrojm)) - Automated dependency upgrades [\#868](https://github.com/materialsproject/maggma/pull/868) ([github-actions[bot]](https://github.com/apps/github-actions)) ## [v0.57.1](https://github.com/materialsproject/maggma/tree/v0.57.1) (2023-10-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.57.0...v0.57.1) **Closed issues:** - Support for Pydantic 2 [\#858](https://github.com/materialsproject/maggma/issues/858) ## [v0.57.0](https://github.com/materialsproject/maggma/tree/v0.57.0) (2023-09-26) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.56.0...v0.57.0) **Merged pull requests:** - Pydantic 2.0 [\#865](https://github.com/materialsproject/maggma/pull/865) ([munrojm](https://github.com/munrojm)) - Revert "Automated dependency upgrades" [\#862](https://github.com/materialsproject/maggma/pull/862) ([rkingsbury](https://github.com/rkingsbury)) - CI: add changelog template and prevent duplicate GH Action releases [\#861](https://github.com/materialsproject/maggma/pull/861) ([rkingsbury](https://github.com/rkingsbury)) - Automated dependency upgrades [\#860](https://github.com/materialsproject/maggma/pull/860) ([github-actions[bot]](https://github.com/apps/github-actions)) - Update @arosen93 to @Andrew-S-Rosen [\#859](https://github.com/materialsproject/maggma/pull/859) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) ## [v0.56.0](https://github.com/materialsproject/maggma/tree/v0.56.0) (2023-09-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.55.0...v0.56.0) ## [v0.55.0](https://github.com/materialsproject/maggma/tree/v0.55.0) (2023-09-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.54.0...v0.55.0) **Closed issues:** - Would the maggma docs be a good place to host MongoDB setup instructions? 
[\#845](https://github.com/materialsproject/maggma/issues/845) **Merged pull requests:** - Automated dependency upgrades [\#856](https://github.com/materialsproject/maggma/pull/856) ([github-actions[bot]](https://github.com/apps/github-actions)) - migrate dependencies to setup.py and update CI config [\#855](https://github.com/materialsproject/maggma/pull/855) ([rkingsbury](https://github.com/rkingsbury)) - Automated dependency upgrades [\#854](https://github.com/materialsproject/maggma/pull/854) ([github-actions[bot]](https://github.com/apps/github-actions)) - Fix broken link in README.md [\#853](https://github.com/materialsproject/maggma/pull/853) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) - Create dependabot.yml to update GitHub actions packages [\#852](https://github.com/materialsproject/maggma/pull/852) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) - Remove tests for Python 3.7 since it reached its end-of-life [\#851](https://github.com/materialsproject/maggma/pull/851) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) - Update CI pipeline so repeated commits don't cause concurrent tests [\#850](https://github.com/materialsproject/maggma/pull/850) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) - Add a "Setting up MongoDB" guide to the docs and update README [\#849](https://github.com/materialsproject/maggma/pull/849) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) - CI: use OS-specific requirements in testing [\#841](https://github.com/materialsproject/maggma/pull/841) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.54.0](https://github.com/materialsproject/maggma/tree/v0.54.0) (2023-08-29) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.53.1...v0.54.0) **Merged pull requests:** - Automated dependency upgrades [\#848](https://github.com/materialsproject/maggma/pull/848) ([github-actions[bot]](https://github.com/apps/github-actions)) - JSONStore: enabled reading of MongoDB extended JSON files [\#847](https://github.com/materialsproject/maggma/pull/847) ([rkingsbury](https://github.com/rkingsbury)) - Automated dependency upgrades [\#844](https://github.com/materialsproject/maggma/pull/844) ([github-actions[bot]](https://github.com/apps/github-actions)) ## [v0.53.1](https://github.com/materialsproject/maggma/tree/v0.53.1) (2023-08-15) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.53.0...v0.53.1) **Merged pull requests:** - Aws store botocore fix [\#843](https://github.com/materialsproject/maggma/pull/843) ([tsmathis](https://github.com/tsmathis)) - Automated dependency upgrades [\#842](https://github.com/materialsproject/maggma/pull/842) ([github-actions[bot]](https://github.com/apps/github-actions)) - CI: small update to auto dependency workflow [\#840](https://github.com/materialsproject/maggma/pull/840) ([rkingsbury](https://github.com/rkingsbury)) - Automated dependency upgrades [\#839](https://github.com/materialsproject/maggma/pull/839) ([github-actions[bot]](https://github.com/apps/github-actions)) ## [v0.53.0](https://github.com/materialsproject/maggma/tree/v0.53.0) (2023-08-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.52.1...v0.53.0) **Merged pull requests:** - allow \>5GB and turn on multi-part uploads for AWS [\#829](https://github.com/materialsproject/maggma/pull/829) ([kbuma](https://github.com/kbuma)) ## [v0.52.1](https://github.com/materialsproject/maggma/tree/v0.52.1) (2023-08-02) [Full 
Changelog](https://github.com/materialsproject/maggma/compare/v0.52.2...v0.52.1) ## [v0.52.2](https://github.com/materialsproject/maggma/tree/v0.52.2) (2023-08-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.52.0...v0.52.2) **Merged pull requests:** - Allow maggma to be used without Azure [\#837](https://github.com/materialsproject/maggma/pull/837) ([jmmshn](https://github.com/jmmshn)) - rm merge-me action [\#836](https://github.com/materialsproject/maggma/pull/836) ([rkingsbury](https://github.com/rkingsbury)) - Automated dependency upgrades [\#835](https://github.com/materialsproject/maggma/pull/835) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.52.0](https://github.com/materialsproject/maggma/tree/v0.52.0) (2023-07-31) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.25...v0.52.0) ## [v0.51.25](https://github.com/materialsproject/maggma/tree/v0.51.25) (2023-07-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.24...v0.51.25) **Merged pull requests:** - Some cleanup: `isort`, updated classifiers, remove unused kwarg [\#833](https://github.com/materialsproject/maggma/pull/833) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) ## [v0.51.24](https://github.com/materialsproject/maggma/tree/v0.51.24) (2023-07-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.23...v0.51.24) ## [v0.51.23](https://github.com/materialsproject/maggma/tree/v0.51.23) (2023-07-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.22...v0.51.23) **Closed issues:** - `database_name` of `MontyStore` doesn't seem to update the name [\#820](https://github.com/materialsproject/maggma/issues/820) **Merged pull requests:** - FileStore performance enhancements [\#824](https://github.com/materialsproject/maggma/pull/824) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.51.22](https://github.com/materialsproject/maggma/tree/v0.51.22) (2023-07-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.21...v0.51.22) **Closed issues:** - Instantiating a `Store` from a dictionary representation [\#825](https://github.com/materialsproject/maggma/issues/825) **Merged pull requests:** - misc. 
MontyStore improvements [\#827](https://github.com/materialsproject/maggma/pull/827) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.51.21](https://github.com/materialsproject/maggma/tree/v0.51.21) (2023-07-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.20...v0.51.21) ## [v0.51.20](https://github.com/materialsproject/maggma/tree/v0.51.20) (2023-07-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.19...v0.51.20) **Merged pull requests:** - Fixe ruamel.yaml dependency [\#823](https://github.com/materialsproject/maggma/pull/823) ([jmmshn](https://github.com/jmmshn)) ## [v0.51.19](https://github.com/materialsproject/maggma/tree/v0.51.19) (2023-07-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.18...v0.51.19) **Merged pull requests:** - Update setup.py [\#822](https://github.com/materialsproject/maggma/pull/822) ([jmmshn](https://github.com/jmmshn)) ## [v0.51.18](https://github.com/materialsproject/maggma/tree/v0.51.18) (2023-07-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.17...v0.51.18) **Merged pull requests:** - Add `MontyStore` to `maggma.stores.__init__` [\#819](https://github.com/materialsproject/maggma/pull/819) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) ## [v0.51.17](https://github.com/materialsproject/maggma/tree/v0.51.17) (2023-07-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.16...v0.51.17) **Merged pull requests:** - Revert store close apart from S3 [\#818](https://github.com/materialsproject/maggma/pull/818) ([munrojm](https://github.com/munrojm)) ## [v0.51.16](https://github.com/materialsproject/maggma/tree/v0.51.16) (2023-07-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.15...v0.51.16) **Merged pull requests:** - Fix s3 store collection close handling in resource classes [\#817](https://github.com/materialsproject/maggma/pull/817) ([munrojm](https://github.com/munrojm)) ## [v0.51.15](https://github.com/materialsproject/maggma/tree/v0.51.15) (2023-07-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.14...v0.51.15) **Merged pull requests:** - Fix collection check [\#816](https://github.com/materialsproject/maggma/pull/816) ([munrojm](https://github.com/munrojm)) ## [v0.51.14](https://github.com/materialsproject/maggma/tree/v0.51.14) (2023-07-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.13...v0.51.14) **Merged pull requests:** - Check for collection before closing in resources [\#815](https://github.com/materialsproject/maggma/pull/815) ([munrojm](https://github.com/munrojm)) ## [v0.51.13](https://github.com/materialsproject/maggma/tree/v0.51.13) (2023-07-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.10...v0.51.13) **Merged pull requests:** - Add explicit store close to resources [\#814](https://github.com/materialsproject/maggma/pull/814) ([munrojm](https://github.com/munrojm)) ## [v0.51.10](https://github.com/materialsproject/maggma/tree/v0.51.10) (2023-06-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.11...v0.51.10) ## [v0.51.11](https://github.com/materialsproject/maggma/tree/v0.51.11) (2023-06-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.12...v0.51.11) ## [v0.51.12](https://github.com/materialsproject/maggma/tree/v0.51.12) (2023-06-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.9...v0.51.12) 
**Merged pull requests:** - fix the response field [\#811](https://github.com/materialsproject/maggma/pull/811) ([yang-ruoxi](https://github.com/yang-ruoxi)) ## [v0.51.9](https://github.com/materialsproject/maggma/tree/v0.51.9) (2023-06-22) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.8...v0.51.9) **Fixed bugs:** - python 3.11 CI test failure with AzureBlobStore [\#807](https://github.com/materialsproject/maggma/issues/807) **Merged pull requests:** - add patch method for submission resource [\#809](https://github.com/materialsproject/maggma/pull/809) ([yang-ruoxi](https://github.com/yang-ruoxi)) ## [v0.51.8](https://github.com/materialsproject/maggma/tree/v0.51.8) (2023-06-14) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.7...v0.51.8) **Implemented enhancements:** - Memray memory profiler support for mrun command line tool [\#794](https://github.com/materialsproject/maggma/pull/794) ([tsmathis](https://github.com/tsmathis)) **Closed issues:** - `MontyStore` cannot be used with a pre-existing local DB [\#796](https://github.com/materialsproject/maggma/issues/796) **Merged pull requests:** - Fixing bug in azure multi worker test [\#808](https://github.com/materialsproject/maggma/pull/808) ([gpetretto](https://github.com/gpetretto)) - Clarify docstring for `MontyDB` and add missing `self.database` property [\#806](https://github.com/materialsproject/maggma/pull/806) ([Andrew-S-Rosen](https://github.com/Andrew-S-Rosen)) - Add CodeQL workflow for GitHub code scanning [\#743](https://github.com/materialsproject/maggma/pull/743) ([lgtm-com[bot]](https://github.com/apps/lgtm-com)) ## [v0.51.7](https://github.com/materialsproject/maggma/tree/v0.51.7) (2023-06-12) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.6...v0.51.7) **Merged pull requests:** - Explicitly close s3 client connections in `S3Store` [\#805](https://github.com/materialsproject/maggma/pull/805) ([munrojm](https://github.com/munrojm)) ## [v0.51.6](https://github.com/materialsproject/maggma/tree/v0.51.6) (2023-06-08) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.5...v0.51.6) **Merged pull requests:** - Use tqdm.auto [\#795](https://github.com/materialsproject/maggma/pull/795) ([utf](https://github.com/utf)) ## [v0.51.5](https://github.com/materialsproject/maggma/tree/v0.51.5) (2023-06-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.4...v0.51.5) **Merged pull requests:** - Disable worker timeouts by default [\#793](https://github.com/materialsproject/maggma/pull/793) ([munrojm](https://github.com/munrojm)) ## [v0.51.4](https://github.com/materialsproject/maggma/tree/v0.51.4) (2023-06-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.3...v0.51.4) **Merged pull requests:** - modify JSONStore file creation [\#792](https://github.com/materialsproject/maggma/pull/792) ([gpetretto](https://github.com/gpetretto)) ## [v0.51.3](https://github.com/materialsproject/maggma/tree/v0.51.3) (2023-05-29) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.2...v0.51.3) ## [v0.51.2](https://github.com/materialsproject/maggma/tree/v0.51.2) (2023-05-29) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.1...v0.51.2) **Merged pull requests:** - Add orjson options in JSONStore [\#791](https://github.com/materialsproject/maggma/pull/791) ([gpetretto](https://github.com/gpetretto)) - Implementation of an AzureBlobStore for Azure blobs 
[\#790](https://github.com/materialsproject/maggma/pull/790) ([gpetretto](https://github.com/gpetretto)) ## [v0.51.1](https://github.com/materialsproject/maggma/tree/v0.51.1) (2023-05-22) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.51.0...v0.51.1) **Merged pull requests:** - Add ruamel-yaml as a dependency [\#789](https://github.com/materialsproject/maggma/pull/789) ([sivonxay](https://github.com/sivonxay)) ## [v0.51.0](https://github.com/materialsproject/maggma/tree/v0.51.0) (2023-05-22) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.50.4...v0.51.0) **Merged pull requests:** - Create a MultiStore object and a Store-like object to access it [\#787](https://github.com/materialsproject/maggma/pull/787) ([sivonxay](https://github.com/sivonxay)) ## [v0.50.4](https://github.com/materialsproject/maggma/tree/v0.50.4) (2023-04-28) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.50.3...v0.50.4) **Merged pull requests:** - Update MongoStore `count` method [\#785](https://github.com/materialsproject/maggma/pull/785) ([munrojm](https://github.com/munrojm)) ## [v0.50.3](https://github.com/materialsproject/maggma/tree/v0.50.3) (2023-02-17) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.50.2...v0.50.3) **Merged pull requests:** - Remove extra heartbeats from workers [\#779](https://github.com/materialsproject/maggma/pull/779) ([munrojm](https://github.com/munrojm)) ## [v0.50.2](https://github.com/materialsproject/maggma/tree/v0.50.2) (2023-02-17) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.50.1...v0.50.2) **Merged pull requests:** - Pydantic CLI settings [\#778](https://github.com/materialsproject/maggma/pull/778) ([munrojm](https://github.com/munrojm)) ## [v0.50.1](https://github.com/materialsproject/maggma/tree/v0.50.1) (2023-02-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.50.0...v0.50.1) **Merged pull requests:** - Remove stray print in worker debug [\#777](https://github.com/materialsproject/maggma/pull/777) ([munrojm](https://github.com/munrojm)) ## [v0.50.0](https://github.com/materialsproject/maggma/tree/v0.50.0) (2023-02-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.18...v0.50.0) **Merged pull requests:** - Overhaul distributed framework and add RabbitMQ support [\#776](https://github.com/materialsproject/maggma/pull/776) ([munrojm](https://github.com/munrojm)) ## [v0.49.18](https://github.com/materialsproject/maggma/tree/v0.49.18) (2023-02-13) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.17...v0.49.18) **Merged pull requests:** - Add more heartbeat pings from worker [\#775](https://github.com/materialsproject/maggma/pull/775) ([munrojm](https://github.com/munrojm)) ## [v0.49.17](https://github.com/materialsproject/maggma/tree/v0.49.17) (2023-01-30) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.16...v0.49.17) **Merged pull requests:** - Remove default sorting from API [\#770](https://github.com/materialsproject/maggma/pull/770) ([munrojm](https://github.com/munrojm)) ## [v0.49.16](https://github.com/materialsproject/maggma/tree/v0.49.16) (2023-01-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.15...v0.49.16) **Merged pull requests:** - Query pipeline out of memory fix [\#767](https://github.com/materialsproject/maggma/pull/767) ([munrojm](https://github.com/munrojm)) ## 
[v0.49.15](https://github.com/materialsproject/maggma/tree/v0.49.15) (2023-01-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.14...v0.49.15) **Merged pull requests:** - Fix server-side API sorting [\#766](https://github.com/materialsproject/maggma/pull/766) ([munrojm](https://github.com/munrojm)) ## [v0.49.14](https://github.com/materialsproject/maggma/tree/v0.49.14) (2023-01-18) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.13...v0.49.14) **Merged pull requests:** - Fix S3 store queries in API [\#761](https://github.com/materialsproject/maggma/pull/761) ([munrojm](https://github.com/munrojm)) ## [v0.49.13](https://github.com/materialsproject/maggma/tree/v0.49.13) (2023-01-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.12...v0.49.13) **Merged pull requests:** - Aggregation pipelines in resource classes [\#759](https://github.com/materialsproject/maggma/pull/759) ([munrojm](https://github.com/munrojm)) ## [v0.49.12](https://github.com/materialsproject/maggma/tree/v0.49.12) (2023-01-09) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.11...v0.49.12) **Merged pull requests:** - Add default sort parameter to `MongoStore` [\#758](https://github.com/materialsproject/maggma/pull/758) ([munrojm](https://github.com/munrojm)) ## [v0.49.11](https://github.com/materialsproject/maggma/tree/v0.49.11) (2022-12-15) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.10...v0.49.11) **Merged pull requests:** - Async to sync for fastapi funcs [\#750](https://github.com/materialsproject/maggma/pull/750) ([munrojm](https://github.com/munrojm)) ## [v0.49.10](https://github.com/materialsproject/maggma/tree/v0.49.10) (2022-12-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.9...v0.49.10) **Merged pull requests:** - Enable disk use in mongo find [\#749](https://github.com/materialsproject/maggma/pull/749) ([munrojm](https://github.com/munrojm)) ## [v0.49.9](https://github.com/materialsproject/maggma/tree/v0.49.9) (2022-11-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.8...v0.49.9) **Merged pull requests:** - Parse datetime with dateutil [\#741](https://github.com/materialsproject/maggma/pull/741) ([munrojm](https://github.com/munrojm)) ## [v0.49.8](https://github.com/materialsproject/maggma/tree/v0.49.8) (2022-10-25) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.7...v0.49.8) ## [v0.49.7](https://github.com/materialsproject/maggma/tree/v0.49.7) (2022-10-25) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.6...v0.49.7) **Merged pull requests:** - FileStore: fix metadata overwriting path [\#736](https://github.com/materialsproject/maggma/pull/736) ([rkingsbury](https://github.com/rkingsbury)) - JSONStore: fix last\_updated serialization problem [\#735](https://github.com/materialsproject/maggma/pull/735) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.49.6](https://github.com/materialsproject/maggma/tree/v0.49.6) (2022-10-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.5...v0.49.6) **Merged pull requests:** - Default sort on \_id for determinacy [\#732](https://github.com/materialsproject/maggma/pull/732) ([munrojm](https://github.com/munrojm)) ## [v0.49.5](https://github.com/materialsproject/maggma/tree/v0.49.5) (2022-09-30) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.4...v0.49.5) **Merged pull requests:** - 
Up manager timeout [\#718](https://github.com/materialsproject/maggma/pull/718) ([munrojm](https://github.com/munrojm)) ## [v0.49.4](https://github.com/materialsproject/maggma/tree/v0.49.4) (2022-09-28) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.3...v0.49.4) **Merged pull requests:** - Up worker timeout [\#717](https://github.com/materialsproject/maggma/pull/717) ([munrojm](https://github.com/munrojm)) ## [v0.49.3](https://github.com/materialsproject/maggma/tree/v0.49.3) (2022-09-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.2...v0.49.3) **Merged pull requests:** - Update high water mark [\#716](https://github.com/materialsproject/maggma/pull/716) ([munrojm](https://github.com/munrojm)) ## [v0.49.2](https://github.com/materialsproject/maggma/tree/v0.49.2) (2022-09-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.1...v0.49.2) **Merged pull requests:** - Fix stalling in distributed code [\#715](https://github.com/materialsproject/maggma/pull/715) ([munrojm](https://github.com/munrojm)) ## [v0.49.1](https://github.com/materialsproject/maggma/tree/v0.49.1) (2022-09-26) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.49.0...v0.49.1) **Merged pull requests:** - Send proper exit message to workers [\#714](https://github.com/materialsproject/maggma/pull/714) ([munrojm](https://github.com/munrojm)) ## [v0.49.0](https://github.com/materialsproject/maggma/tree/v0.49.0) (2022-09-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.48.1...v0.49.0) **Merged pull requests:** - Enhance distributed builder code [\#711](https://github.com/materialsproject/maggma/pull/711) ([munrojm](https://github.com/munrojm)) ## [v0.48.1](https://github.com/materialsproject/maggma/tree/v0.48.1) (2022-09-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.48.0...v0.48.1) **Merged pull requests:** - Add ssh\_tunnel option to GridFSStore [\#707](https://github.com/materialsproject/maggma/pull/707) ([utf](https://github.com/utf)) ## [v0.48.0](https://github.com/materialsproject/maggma/tree/v0.48.0) (2022-08-04) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.47.6...v0.48.0) **Merged pull requests:** - Proposal: remove Drone class [\#669](https://github.com/materialsproject/maggma/pull/669) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.47.6](https://github.com/materialsproject/maggma/tree/v0.47.6) (2022-08-04) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.47.5...v0.47.6) **Merged pull requests:** - Docs: add mermaid diagram support [\#677](https://github.com/materialsproject/maggma/pull/677) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.47.5](https://github.com/materialsproject/maggma/tree/v0.47.5) (2022-07-26) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.47.4...v0.47.5) **Merged pull requests:** - Add pymongo timeout context to queries [\#691](https://github.com/materialsproject/maggma/pull/691) ([munrojm](https://github.com/munrojm)) ## [v0.47.4](https://github.com/materialsproject/maggma/tree/v0.47.4) (2022-07-25) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.47.3...v0.47.4) **Merged pull requests:** - Ensure all fields are properly sanitized [\#690](https://github.com/materialsproject/maggma/pull/690) ([munrojm](https://github.com/munrojm)) ## [v0.47.3](https://github.com/materialsproject/maggma/tree/v0.47.3) (2022-06-07) [Full 
Changelog](https://github.com/materialsproject/maggma/compare/v0.47.2...v0.47.3) **Merged pull requests:** - Fix header processing [\#679](https://github.com/materialsproject/maggma/pull/679) ([munrojm](https://github.com/munrojm)) ## [v0.47.2](https://github.com/materialsproject/maggma/tree/v0.47.2) (2022-05-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.47.1...v0.47.2) **Merged pull requests:** - Docs updates: add FileStore and misc edits [\#668](https://github.com/materialsproject/maggma/pull/668) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.47.1](https://github.com/materialsproject/maggma/tree/v0.47.1) (2022-05-24) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.47.0...v0.47.1) **Merged pull requests:** - Fix gridfs URI store [\#667](https://github.com/materialsproject/maggma/pull/667) ([utf](https://github.com/utf)) ## [v0.47.0](https://github.com/materialsproject/maggma/tree/v0.47.0) (2022-05-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.46.2...v0.47.0) **Merged pull requests:** - FileStore: a Store for files on disk [\#619](https://github.com/materialsproject/maggma/pull/619) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.46.2](https://github.com/materialsproject/maggma/tree/v0.46.2) (2022-05-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.46.1...v0.46.2) **Merged pull requests:** - allow s3 resource kwargs [\#665](https://github.com/materialsproject/maggma/pull/665) ([jmmshn](https://github.com/jmmshn)) ## [v0.46.1](https://github.com/materialsproject/maggma/tree/v0.46.1) (2022-04-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.46.0...v0.46.1) **Merged pull requests:** - Prefix `fields` input for read resource key endpoint [\#636](https://github.com/materialsproject/maggma/pull/636) ([munrojm](https://github.com/munrojm)) ## [v0.46.0](https://github.com/materialsproject/maggma/tree/v0.46.0) (2022-04-19) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.45.1...v0.46.0) **Merged pull requests:** - S3 store and resource additions [\#635](https://github.com/materialsproject/maggma/pull/635) ([munrojm](https://github.com/munrojm)) ## [v0.45.1](https://github.com/materialsproject/maggma/tree/v0.45.1) (2022-04-18) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.45.0...v0.45.1) **Merged pull requests:** - minor bug fix in remove\_docs [\#626](https://github.com/materialsproject/maggma/pull/626) ([jmmshn](https://github.com/jmmshn)) ## [v0.45.0](https://github.com/materialsproject/maggma/tree/v0.45.0) (2022-04-14) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.44.5...v0.45.0) **Merged pull requests:** - Changes to core query operators and API [\#620](https://github.com/materialsproject/maggma/pull/620) ([munrojm](https://github.com/munrojm)) ## [v0.44.5](https://github.com/materialsproject/maggma/tree/v0.44.5) (2022-04-12) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.44.4...v0.44.5) **Merged pull requests:** - JSONStore: file\_writable -\> read\_only [\#625](https://github.com/materialsproject/maggma/pull/625) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.44.4](https://github.com/materialsproject/maggma/tree/v0.44.4) (2022-04-12) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.44.3...v0.44.4) **Merged pull requests:** - JSONStore: write file on init, add descriptive KeyError, add tests 
[\#624](https://github.com/materialsproject/maggma/pull/624) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.44.3](https://github.com/materialsproject/maggma/tree/v0.44.3) (2022-04-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.44.2...v0.44.3) **Merged pull requests:** - MemoryStore: fix groupby ignoring properties [\#621](https://github.com/materialsproject/maggma/pull/621) ([rkingsbury](https://github.com/rkingsbury)) ## [v0.44.2](https://github.com/materialsproject/maggma/tree/v0.44.2) (2022-04-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.44.1...v0.44.2) **Merged pull requests:** - Force post-process method to take in query params [\#618](https://github.com/materialsproject/maggma/pull/618) ([munrojm](https://github.com/munrojm)) ## [v0.44.1](https://github.com/materialsproject/maggma/tree/v0.44.1) (2022-03-08) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.44.0...v0.44.1) **Merged pull requests:** - added localhost test for MongoURIStore [\#595](https://github.com/materialsproject/maggma/pull/595) ([jmmshn](https://github.com/jmmshn)) ## [v0.44.0](https://github.com/materialsproject/maggma/tree/v0.44.0) (2022-03-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.43.0...v0.44.0) ## [v0.43.0](https://github.com/materialsproject/maggma/tree/v0.43.0) (2022-03-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.42.0...v0.43.0) ## [v0.42.0](https://github.com/materialsproject/maggma/tree/v0.42.0) (2022-03-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.41.1...v0.42.0) **Merged pull requests:** - Remove python3.6 support and fix tests [\#579](https://github.com/materialsproject/maggma/pull/579) ([munrojm](https://github.com/munrojm)) - typo [\#576](https://github.com/materialsproject/maggma/pull/576) ([jmmshn](https://github.com/jmmshn)) ## [v0.41.1](https://github.com/materialsproject/maggma/tree/v0.41.1) (2022-03-05) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.41.0...v0.41.1) **Merged pull requests:** - mongoclient\_kwargs [\#575](https://github.com/materialsproject/maggma/pull/575) ([jmmshn](https://github.com/jmmshn)) - change cleint -\> resource in aws tests [\#574](https://github.com/materialsproject/maggma/pull/574) ([jmmshn](https://github.com/jmmshn)) ## [v0.41.0](https://github.com/materialsproject/maggma/tree/v0.41.0) (2022-02-15) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.40.0...v0.41.0) **Merged pull requests:** - Add header processing abilities to certain `Resource` classes [\#569](https://github.com/materialsproject/maggma/pull/569) ([munrojm](https://github.com/munrojm)) ## [v0.40.0](https://github.com/materialsproject/maggma/tree/v0.40.0) (2022-02-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.39.1...v0.40.0) **Merged pull requests:** - Add authsource option for mongo and gridfs stores [\#567](https://github.com/materialsproject/maggma/pull/567) ([utf](https://github.com/utf)) ## [v0.39.1](https://github.com/materialsproject/maggma/tree/v0.39.1) (2022-01-27) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.39.0...v0.39.1) **Fixed bugs:** - Single import-dependence on pynng causes M1 Mac install error [\#528](https://github.com/materialsproject/maggma/issues/528) **Merged pull requests:** - Add boto3 to required packages [\#544](https://github.com/materialsproject/maggma/pull/544) 
([munrojm](https://github.com/munrojm)) ## [v0.39.0](https://github.com/materialsproject/maggma/tree/v0.39.0) (2022-01-26) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.38.1...v0.39.0) **Merged pull requests:** - Replace `pynng` functionality with `pyzmq` [\#543](https://github.com/materialsproject/maggma/pull/543) ([munrojm](https://github.com/munrojm)) - Encode `_` as `--` in metadata when using `S3Store.write_doc_to_s3` [\#532](https://github.com/materialsproject/maggma/pull/532) ([mkhorton](https://github.com/mkhorton)) ## [v0.38.1](https://github.com/materialsproject/maggma/tree/v0.38.1) (2021-12-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.38.0...v0.38.1) **Merged pull requests:** - Add ability to input index hints to count method [\#524](https://github.com/materialsproject/maggma/pull/524) ([munrojm](https://github.com/munrojm)) ## [v0.38.0](https://github.com/materialsproject/maggma/tree/v0.38.0) (2021-12-09) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.37.0...v0.38.0) **Merged pull requests:** - Fix issue with `close` and `MongoStore` and update `_collection` attribute [\#523](https://github.com/materialsproject/maggma/pull/523) ([munrojm](https://github.com/munrojm)) ## [v0.37.0](https://github.com/materialsproject/maggma/tree/v0.37.0) (2021-12-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.36.0...v0.37.0) **Merged pull requests:** - Revert broken MongoStore auth testing [\#522](https://github.com/materialsproject/maggma/pull/522) ([munrojm](https://github.com/munrojm)) - Fix authentication for `MongoStore` to work with `pymongo==4.0` [\#521](https://github.com/materialsproject/maggma/pull/521) ([munrojm](https://github.com/munrojm)) ## [v0.36.0](https://github.com/materialsproject/maggma/tree/v0.36.0) (2021-12-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.35.0...v0.36.0) **Merged pull requests:** - Added on-disk MongoDB compatible MontyStore [\#514](https://github.com/materialsproject/maggma/pull/514) ([utf](https://github.com/utf)) ## [v0.35.0](https://github.com/materialsproject/maggma/tree/v0.35.0) (2021-12-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.34.0...v0.35.0) ## [v0.34.0](https://github.com/materialsproject/maggma/tree/v0.34.0) (2021-12-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.33.2...v0.34.0) **Merged pull requests:** - Changes to accommodate new pymongo [\#517](https://github.com/materialsproject/maggma/pull/517) ([munrojm](https://github.com/munrojm)) ## [v0.33.2](https://github.com/materialsproject/maggma/tree/v0.33.2) (2021-12-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.33.1...v0.33.2) **Merged pull requests:** - Patch mongo store connect methods [\#516](https://github.com/materialsproject/maggma/pull/516) ([munrojm](https://github.com/munrojm)) ## [v0.33.1](https://github.com/materialsproject/maggma/tree/v0.33.1) (2021-12-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.33.0...v0.33.1) **Merged pull requests:** - Patch memory store connect method [\#515](https://github.com/materialsproject/maggma/pull/515) ([munrojm](https://github.com/munrojm)) ## [v0.33.0](https://github.com/materialsproject/maggma/tree/v0.33.0) (2021-11-30) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.32.3...v0.33.0) **Merged pull requests:** - MongoDB hint support 
[\#513](https://github.com/materialsproject/maggma/pull/513) ([munrojm](https://github.com/munrojm)) ## [v0.32.3](https://github.com/materialsproject/maggma/tree/v0.32.3) (2021-11-25) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.32.2...v0.32.3) **Merged pull requests:** - Added option for writable JSONStores \(for single JSON files only\). [\#507](https://github.com/materialsproject/maggma/pull/507) ([davidwaroquiers](https://github.com/davidwaroquiers)) ## [v0.32.2](https://github.com/materialsproject/maggma/tree/v0.32.2) (2021-11-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.32.1...v0.32.2) **Merged pull requests:** - Alter sorting query operator to take comma delimited string [\#510](https://github.com/materialsproject/maggma/pull/510) ([munrojm](https://github.com/munrojm)) ## [v0.32.1](https://github.com/materialsproject/maggma/tree/v0.32.1) (2021-11-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.32.0...v0.32.1) **Merged pull requests:** - Default to yaml full loader to fix tests [\#505](https://github.com/materialsproject/maggma/pull/505) ([munrojm](https://github.com/munrojm)) - Add GridFSURIStore with support for URI connections [\#504](https://github.com/materialsproject/maggma/pull/504) ([utf](https://github.com/utf)) ## [v0.32.0](https://github.com/materialsproject/maggma/tree/v0.32.0) (2021-10-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.31.0...v0.32.0) **Merged pull requests:** - Update sorting query operator to take multiple fields [\#500](https://github.com/materialsproject/maggma/pull/500) ([munrojm](https://github.com/munrojm)) - Change to S3Store serialization behavior in update\(\) and other Mongolike Store changes [\#493](https://github.com/materialsproject/maggma/pull/493) ([sivonxay](https://github.com/sivonxay)) ## [v0.31.0](https://github.com/materialsproject/maggma/tree/v0.31.0) (2021-08-14) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.30.4...v0.31.0) **Merged pull requests:** - Add from\_launchpad\_file classmethod to MongoStore [\#476](https://github.com/materialsproject/maggma/pull/476) ([sivonxay](https://github.com/sivonxay)) ## [v0.30.4](https://github.com/materialsproject/maggma/tree/v0.30.4) (2021-08-04) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.30.3...v0.30.4) **Merged pull requests:** - Fix documentation in aggregation and sparse fields [\#469](https://github.com/materialsproject/maggma/pull/469) ([munrojm](https://github.com/munrojm)) ## [v0.30.3](https://github.com/materialsproject/maggma/tree/v0.30.3) (2021-08-04) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.30.2...v0.30.3) **Merged pull requests:** - Enable enhanced documentation [\#468](https://github.com/materialsproject/maggma/pull/468) ([munrojm](https://github.com/munrojm)) ## [v0.30.2](https://github.com/materialsproject/maggma/tree/v0.30.2) (2021-07-09) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.30.1...v0.30.2) **Merged pull requests:** - orjson added to setup.py [\#465](https://github.com/materialsproject/maggma/pull/465) ([munrojm](https://github.com/munrojm)) ## [v0.30.1](https://github.com/materialsproject/maggma/tree/v0.30.1) (2021-07-09) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.30.0...v0.30.1) **Merged pull requests:** - Switch from monty to orjson for serialization [\#464](https://github.com/materialsproject/maggma/pull/464) 
([munrojm](https://github.com/munrojm)) ## [v0.30.0](https://github.com/materialsproject/maggma/tree/v0.30.0) (2021-07-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.29.4...v0.30.0) **Merged pull requests:** - Enable monty encoded direct responses [\#463](https://github.com/materialsproject/maggma/pull/463) ([munrojm](https://github.com/munrojm)) ## [v0.29.4](https://github.com/materialsproject/maggma/tree/v0.29.4) (2021-06-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.29.3...v0.29.4) **Merged pull requests:** - BugFix: Manual distinct in MongoStore not using Criteria [\#461](https://github.com/materialsproject/maggma/pull/461) ([shyamd](https://github.com/shyamd)) ## [v0.29.3](https://github.com/materialsproject/maggma/tree/v0.29.3) (2021-06-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.29.2...v0.29.3) **Merged pull requests:** - Sort query and query operator meta bug fixes [\#453](https://github.com/materialsproject/maggma/pull/453) ([munrojm](https://github.com/munrojm)) ## [v0.29.2](https://github.com/materialsproject/maggma/tree/v0.29.2) (2021-06-18) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.29.1...v0.29.2) **Merged pull requests:** - Fix API Sanitizing MSONable types in combined types [\#454](https://github.com/materialsproject/maggma/pull/454) ([shyamd](https://github.com/shyamd)) ## [v0.29.1](https://github.com/materialsproject/maggma/tree/v0.29.1) (2021-06-15) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.29.0...v0.29.1) **Merged pull requests:** - Switch from classic bson to pymongo bson [\#452](https://github.com/materialsproject/maggma/pull/452) ([shyamd](https://github.com/shyamd)) ## [v0.29.0](https://github.com/materialsproject/maggma/tree/v0.29.0) (2021-06-08) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.28.1...v0.29.0) **Merged pull requests:** - Maggma API additions [\#448](https://github.com/materialsproject/maggma/pull/448) ([munrojm](https://github.com/munrojm)) ## [v0.28.1](https://github.com/materialsproject/maggma/tree/v0.28.1) (2021-06-08) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.28.0...v0.28.1) **Closed issues:** - Indescriptive error when not specifying any builders in CLI [\#446](https://github.com/materialsproject/maggma/issues/446) - Add port auto-negotiation [\#445](https://github.com/materialsproject/maggma/issues/445) **Merged pull requests:** - New release wflow [\#450](https://github.com/materialsproject/maggma/pull/450) ([shyamd](https://github.com/shyamd)) - Ensure Store.name is a property [\#449](https://github.com/materialsproject/maggma/pull/449) ([utf](https://github.com/utf)) ## [v0.28.0](https://github.com/materialsproject/maggma/tree/v0.28.0) (2021-05-26) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.27.0...v0.28.0) **Merged pull requests:** - Updates the CLI Runner [\#447](https://github.com/materialsproject/maggma/pull/447) ([shyamd](https://github.com/shyamd)) ## [v0.27.0](https://github.com/materialsproject/maggma/tree/v0.27.0) (2021-05-12) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.26.0...v0.27.0) **Closed issues:** - Python 3.6 compatability [\#336](https://github.com/materialsproject/maggma/issues/336) **Merged pull requests:** - Fix aws module import [\#435](https://github.com/materialsproject/maggma/pull/435) ([utf](https://github.com/utf)) - coverage 
[\#430](https://github.com/materialsproject/maggma/pull/430) ([jmmshn](https://github.com/jmmshn)) - Update AWS Bucket Detection [\#429](https://github.com/materialsproject/maggma/pull/429) ([jmmshn](https://github.com/jmmshn)) - Add Object Hash to S3Store [\#427](https://github.com/materialsproject/maggma/pull/427) ([jmmshn](https://github.com/jmmshn)) - Rebuild API module [\#423](https://github.com/materialsproject/maggma/pull/423) ([shyamd](https://github.com/shyamd)) - updated documentaion. [\#419](https://github.com/materialsproject/maggma/pull/419) ([jmmshn](https://github.com/jmmshn)) - Revert "Bump ipython from 7.16.1 to 7.21.0" [\#406](https://github.com/materialsproject/maggma/pull/406) ([shyamd](https://github.com/shyamd)) - update gridfs store [\#381](https://github.com/materialsproject/maggma/pull/381) ([gpetretto](https://github.com/gpetretto)) ## [v0.26.0](https://github.com/materialsproject/maggma/tree/v0.26.0) (2021-01-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.25.0...v0.26.0) **Merged pull requests:** - No Progress bars [\#382](https://github.com/materialsproject/maggma/pull/382) ([shyamd](https://github.com/shyamd)) ## [v0.25.0](https://github.com/materialsproject/maggma/tree/v0.25.0) (2020-12-04) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.24.2...v0.25.0) **Closed issues:** - FEATURE: Jupyter Commands [\#276](https://github.com/materialsproject/maggma/issues/276) **Merged pull requests:** - Python 3.6 Compatibility [\#352](https://github.com/materialsproject/maggma/pull/352) ([shyamd](https://github.com/shyamd)) - Automatically parse the dbname from the URI [\#350](https://github.com/materialsproject/maggma/pull/350) ([jmmshn](https://github.com/jmmshn)) - Setup: msgpack-python was renamed to msgpack [\#344](https://github.com/materialsproject/maggma/pull/344) ([jan-janssen](https://github.com/jan-janssen)) - Ensure MongoStore can safely continue updating when documents are too large [\#338](https://github.com/materialsproject/maggma/pull/338) ([shyamd](https://github.com/shyamd)) ## [v0.24.2](https://github.com/materialsproject/maggma/tree/v0.24.2) (2020-11-17) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.24.1...v0.24.2) **Merged pull requests:** - Fix array unwrapping in distinct [\#335](https://github.com/materialsproject/maggma/pull/335) ([shyamd](https://github.com/shyamd)) ## [v0.24.1](https://github.com/materialsproject/maggma/tree/v0.24.1) (2020-11-17) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.24.0...v0.24.1) **Closed issues:** - mrun failure with 'dict' object has no attribute 'connect' [\#316](https://github.com/materialsproject/maggma/issues/316) - FEATURE: Serialized SSH Tunnel [\#290](https://github.com/materialsproject/maggma/issues/290) **Merged pull requests:** - Fix Distinct in MongoStore [\#332](https://github.com/materialsproject/maggma/pull/332) ([shyamd](https://github.com/shyamd)) - Direct passing of AWS login to S3Store [\#326](https://github.com/materialsproject/maggma/pull/326) ([jmmshn](https://github.com/jmmshn)) - Wrap SSHTunnelForward and make it MSONable [\#320](https://github.com/materialsproject/maggma/pull/320) ([shyamd](https://github.com/shyamd)) ## [v0.24.0](https://github.com/materialsproject/maggma/tree/v0.24.0) (2020-11-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.23.3...v0.24.0) **Merged pull requests:** - Small fix to make sure searchable\_fields are updated 
[\#303](https://github.com/materialsproject/maggma/pull/303) ([jmmshn](https://github.com/jmmshn)) ## [v0.23.3](https://github.com/materialsproject/maggma/tree/v0.23.3) (2020-09-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.23.2...v0.23.3) ## [v0.23.2](https://github.com/materialsproject/maggma/tree/v0.23.2) (2020-09-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.23.1...v0.23.2) ## [v0.23.1](https://github.com/materialsproject/maggma/tree/v0.23.1) (2020-09-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.23.0...v0.23.1) **Closed issues:** - FEATURE: Python file runner [\#277](https://github.com/materialsproject/maggma/issues/277) ## [v0.23.0](https://github.com/materialsproject/maggma/tree/v0.23.0) (2020-09-14) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.22.3...v0.23.0) **Closed issues:** - Separate out S3 Object reference keys from search keys [\#206](https://github.com/materialsproject/maggma/issues/206) **Merged pull requests:** - Add custom source loading [\#278](https://github.com/materialsproject/maggma/pull/278) ([shyamd](https://github.com/shyamd)) - Inject metadata via fields rather than by indicies [\#265](https://github.com/materialsproject/maggma/pull/265) ([shyamd](https://github.com/shyamd)) ## [v0.22.3](https://github.com/materialsproject/maggma/tree/v0.22.3) (2020-08-26) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.22.2...v0.22.3) **Merged pull requests:** - added context manager for stores [\#258](https://github.com/materialsproject/maggma/pull/258) ([jmmshn](https://github.com/jmmshn)) ## [v0.22.2](https://github.com/materialsproject/maggma/tree/v0.22.2) (2020-08-21) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.22.1...v0.22.2) **Merged pull requests:** - Minor bug fixes to S3Store [\#253](https://github.com/materialsproject/maggma/pull/253) ([jmmshn](https://github.com/jmmshn)) ## [v0.22.1](https://github.com/materialsproject/maggma/tree/v0.22.1) (2020-08-11) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.22.0...v0.22.1) **Merged pull requests:** - accept int as sort keys instead of Sort\(\) in .query\(\) and .groupby\(\) [\#241](https://github.com/materialsproject/maggma/pull/241) ([rkingsbury](https://github.com/rkingsbury)) - Update setup.py [\#225](https://github.com/materialsproject/maggma/pull/225) ([jmmshn](https://github.com/jmmshn)) ## [v0.22.0](https://github.com/materialsproject/maggma/tree/v0.22.0) (2020-07-16) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.21.0...v0.22.0) **Merged pull requests:** - Ensure Metadata in Documents from GridFS [\#222](https://github.com/materialsproject/maggma/pull/222) ([shyamd](https://github.com/shyamd)) - Projection\_Builder tests [\#213](https://github.com/materialsproject/maggma/pull/213) ([acrutt](https://github.com/acrutt)) - \[WIP\] Proper multithreading and msgpack fix [\#205](https://github.com/materialsproject/maggma/pull/205) ([jmmshn](https://github.com/jmmshn)) - Fix projection\_builder.update\_targets\(\) [\#179](https://github.com/materialsproject/maggma/pull/179) ([acrutt](https://github.com/acrutt)) ## [v0.21.0](https://github.com/materialsproject/maggma/tree/v0.21.0) (2020-06-22) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.20.0...v0.21.0) **Merged pull requests:** - Reconstruct metadata from index in S3 Store [\#182](https://github.com/materialsproject/maggma/pull/182) 
([jmmshn](https://github.com/jmmshn)) - MapBuilder retry\_failed Fix [\#180](https://github.com/materialsproject/maggma/pull/180) ([acrutt](https://github.com/acrutt)) - MapBuilder retry\_failed bug [\#111](https://github.com/materialsproject/maggma/pull/111) ([acrutt](https://github.com/acrutt)) ## [v0.20.0](https://github.com/materialsproject/maggma/tree/v0.20.0) (2020-05-02) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.19.1...v0.20.0) **Merged pull requests:** - Initial Drone Implementation [\#145](https://github.com/materialsproject/maggma/pull/145) ([wuxiaohua1011](https://github.com/wuxiaohua1011)) - parallel s3 store wrting [\#130](https://github.com/materialsproject/maggma/pull/130) ([jmmshn](https://github.com/jmmshn)) - Make GridFSStore query check files store first. [\#128](https://github.com/materialsproject/maggma/pull/128) ([munrojm](https://github.com/munrojm)) ## [v0.19.1](https://github.com/materialsproject/maggma/tree/v0.19.1) (2020-04-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.19.0...v0.19.1) ## [v0.19.0](https://github.com/materialsproject/maggma/tree/v0.19.0) (2020-04-06) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.18.0...v0.19.0) **Closed issues:** - ISSUE: newer\_in method incompatible with GridFSStore [\#113](https://github.com/materialsproject/maggma/issues/113) **Merged pull requests:** - Fix async [\#129](https://github.com/materialsproject/maggma/pull/129) ([shyamd](https://github.com/shyamd)) - small fixes [\#115](https://github.com/materialsproject/maggma/pull/115) ([jmmshn](https://github.com/jmmshn)) - Store updates [\#114](https://github.com/materialsproject/maggma/pull/114) ([jmmshn](https://github.com/jmmshn)) - \[WIP\] Add EndpointCluster and ClusterManager to maggma [\#66](https://github.com/materialsproject/maggma/pull/66) ([wuxiaohua1011](https://github.com/wuxiaohua1011)) ## [v0.18.0](https://github.com/materialsproject/maggma/tree/v0.18.0) (2020-03-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.17.3...v0.18.0) **Merged pull requests:** - Amazon S3 store update [\#110](https://github.com/materialsproject/maggma/pull/110) ([munrojm](https://github.com/munrojm)) ## [v0.17.3](https://github.com/materialsproject/maggma/tree/v0.17.3) (2020-03-18) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.17.2...v0.17.3) ## [v0.17.2](https://github.com/materialsproject/maggma/tree/v0.17.2) (2020-03-13) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.17.1...v0.17.2) ## [v0.17.1](https://github.com/materialsproject/maggma/tree/v0.17.1) (2020-03-12) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.16.1...v0.17.1) **Merged pull requests:** - Various Bug Fixes [\#109](https://github.com/materialsproject/maggma/pull/109) ([shyamd](https://github.com/shyamd)) - Addition of Projection Builder [\#99](https://github.com/materialsproject/maggma/pull/99) ([acrutt](https://github.com/acrutt)) - Fix issues with last\_updated in MapBuilder [\#98](https://github.com/materialsproject/maggma/pull/98) ([shyamd](https://github.com/shyamd)) - autonotebook for tqdm [\#97](https://github.com/materialsproject/maggma/pull/97) ([shyamd](https://github.com/shyamd)) ## [v0.16.1](https://github.com/materialsproject/maggma/tree/v0.16.1) (2020-01-28) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.16.0...v0.16.1) ## [v0.16.0](https://github.com/materialsproject/maggma/tree/v0.16.0) (2020-01-28) 
[Full Changelog](https://github.com/materialsproject/maggma/compare/v0.15.0...v0.16.0) **Closed issues:** - Onotology generation from builder [\#59](https://github.com/materialsproject/maggma/issues/59) **Merged pull requests:** - Add MongoURIStore [\#93](https://github.com/materialsproject/maggma/pull/93) ([shyamd](https://github.com/shyamd)) - Update distinct to be more like mongo distinct [\#92](https://github.com/materialsproject/maggma/pull/92) ([shyamd](https://github.com/shyamd)) - Add count to maggma store [\#86](https://github.com/materialsproject/maggma/pull/86) ([shyamd](https://github.com/shyamd)) ## [v0.15.0](https://github.com/materialsproject/maggma/tree/v0.15.0) (2020-01-23) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.14.1...v0.15.0) **Closed issues:** - Builder Reporting [\#78](https://github.com/materialsproject/maggma/issues/78) - ZeroMQ based multi-node processing [\#76](https://github.com/materialsproject/maggma/issues/76) - Add time limits to process\_item? \(Possibly just in MapBuilder?\) [\#45](https://github.com/materialsproject/maggma/issues/45) **Merged pull requests:** - \[WIP\] Builder Reporting [\#80](https://github.com/materialsproject/maggma/pull/80) ([shyamd](https://github.com/shyamd)) - Updated GroupBuilder [\#79](https://github.com/materialsproject/maggma/pull/79) ([shyamd](https://github.com/shyamd)) - New Distributed Processor [\#77](https://github.com/materialsproject/maggma/pull/77) ([shyamd](https://github.com/shyamd)) ## [v0.14.1](https://github.com/materialsproject/maggma/tree/v0.14.1) (2020-01-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.14.0...v0.14.1) ## [v0.14.0](https://github.com/materialsproject/maggma/tree/v0.14.0) (2020-01-10) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.13.0...v0.14.0) **Closed issues:** - Preserve last\_updated for MapBuilder [\#58](https://github.com/materialsproject/maggma/issues/58) - Move away from mpi4py [\#51](https://github.com/materialsproject/maggma/issues/51) - Run serial processor directly from builder [\#48](https://github.com/materialsproject/maggma/issues/48) - Update while processing [\#42](https://github.com/materialsproject/maggma/issues/42) - Running JSONStore.connect\(\) multiple times leads to undefined behavior [\#40](https://github.com/materialsproject/maggma/issues/40) - get\_criteria directly invokes mongo commands [\#38](https://github.com/materialsproject/maggma/issues/38) - Cursor timeouts common [\#35](https://github.com/materialsproject/maggma/issues/35) - Possible solution to "stalled" Runner.run ? 
[\#29](https://github.com/materialsproject/maggma/issues/29) **Merged pull requests:** - Release Workflow for Github [\#75](https://github.com/materialsproject/maggma/pull/75) ([shyamd](https://github.com/shyamd)) - Documentation [\#74](https://github.com/materialsproject/maggma/pull/74) ([shyamd](https://github.com/shyamd)) - Reorg code [\#69](https://github.com/materialsproject/maggma/pull/69) ([shyamd](https://github.com/shyamd)) - Updates for new monitoring services [\#67](https://github.com/materialsproject/maggma/pull/67) ([shyamd](https://github.com/shyamd)) - fix GridFSStore [\#64](https://github.com/materialsproject/maggma/pull/64) ([gpetretto](https://github.com/gpetretto)) - Massive refactoring to get ready for v1.0 [\#62](https://github.com/materialsproject/maggma/pull/62) ([shyamd](https://github.com/shyamd)) - Bug Fixes [\#61](https://github.com/materialsproject/maggma/pull/61) ([shyamd](https://github.com/shyamd)) - GridFSStore bug fix [\#60](https://github.com/materialsproject/maggma/pull/60) ([munrojm](https://github.com/munrojm)) - Fix Store serialization with @version [\#57](https://github.com/materialsproject/maggma/pull/57) ([mkhorton](https://github.com/mkhorton)) - Update builder to work with new monty [\#56](https://github.com/materialsproject/maggma/pull/56) ([mkhorton](https://github.com/mkhorton)) ## [v0.13.0](https://github.com/materialsproject/maggma/tree/v0.13.0) (2019-03-29) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.12.0...v0.13.0) **Merged pull requests:** - Add timeout to MapBuilder, store process time [\#54](https://github.com/materialsproject/maggma/pull/54) ([mkhorton](https://github.com/mkhorton)) - Can update pyyaml req? [\#50](https://github.com/materialsproject/maggma/pull/50) ([dwinston](https://github.com/dwinston)) - Concat store [\#47](https://github.com/materialsproject/maggma/pull/47) ([shyamd](https://github.com/shyamd)) ## [v0.12.0](https://github.com/materialsproject/maggma/tree/v0.12.0) (2018-11-19) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.11.0...v0.12.0) ## [v0.11.0](https://github.com/materialsproject/maggma/tree/v0.11.0) (2018-11-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.9.0...v0.11.0) **Merged pull requests:** - Better printing of validation erorrs [\#46](https://github.com/materialsproject/maggma/pull/46) ([mkhorton](https://github.com/mkhorton)) - Updates to JointStore and MapBuilder [\#44](https://github.com/materialsproject/maggma/pull/44) ([shyamd](https://github.com/shyamd)) ## [v0.9.0](https://github.com/materialsproject/maggma/tree/v0.9.0) (2018-10-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.8.0...v0.9.0) **Closed issues:** - Non-obvious error message when trying to query a Store that hasn't been connected [\#41](https://github.com/materialsproject/maggma/issues/41) - Criteria/properties order of MongoStore.query [\#37](https://github.com/materialsproject/maggma/issues/37) - tqdm in Jupyter [\#33](https://github.com/materialsproject/maggma/issues/33) - query args order [\#31](https://github.com/materialsproject/maggma/issues/31) **Merged pull requests:** - Simplification of Validator class + tests [\#39](https://github.com/materialsproject/maggma/pull/39) ([mkhorton](https://github.com/mkhorton)) - Fix for Jupyter detection for tqdm [\#36](https://github.com/materialsproject/maggma/pull/36) ([mkhorton](https://github.com/mkhorton)) - Add tqdm widget inside Jupyter 
[\#34](https://github.com/materialsproject/maggma/pull/34) ([mkhorton](https://github.com/mkhorton)) - Change update\_targets log level from debug to exception [\#32](https://github.com/materialsproject/maggma/pull/32) ([mkhorton](https://github.com/mkhorton)) - Jointstore [\#23](https://github.com/materialsproject/maggma/pull/23) ([montoyjh](https://github.com/montoyjh)) ## [v0.8.0](https://github.com/materialsproject/maggma/tree/v0.8.0) (2018-08-22) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.6.5...v0.8.0) **Merged pull requests:** - \[WIP\] Improve/refactor examples and move inside maggma namespace [\#30](https://github.com/materialsproject/maggma/pull/30) ([dwinston](https://github.com/dwinston)) - Fix mrun with default num\_workers. Add test. [\#28](https://github.com/materialsproject/maggma/pull/28) ([dwinston](https://github.com/dwinston)) ## [v0.6.5](https://github.com/materialsproject/maggma/tree/v0.6.5) (2018-06-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.6.4...v0.6.5) ## [v0.6.4](https://github.com/materialsproject/maggma/tree/v0.6.4) (2018-06-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.6.3...v0.6.4) ## [v0.6.3](https://github.com/materialsproject/maggma/tree/v0.6.3) (2018-06-07) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.6.2...v0.6.3) **Merged pull requests:** - Add MongograntStore [\#27](https://github.com/materialsproject/maggma/pull/27) ([dwinston](https://github.com/dwinston)) ## [v0.6.2](https://github.com/materialsproject/maggma/tree/v0.6.2) (2018-06-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.6.1...v0.6.2) ## [v0.6.1](https://github.com/materialsproject/maggma/tree/v0.6.1) (2018-06-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.6.0...v0.6.1) **Merged pull requests:** - Help user if e.g. 
target store built without lu\_field [\#26](https://github.com/materialsproject/maggma/pull/26) ([dwinston](https://github.com/dwinston)) ## [v0.6.0](https://github.com/materialsproject/maggma/tree/v0.6.0) (2018-05-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.5.0...v0.6.0) **Implemented enhancements:** - Progress Bar [\#21](https://github.com/materialsproject/maggma/issues/21) - Query Engine equivalent [\#9](https://github.com/materialsproject/maggma/issues/9) **Merged pull requests:** - Progress Bars for Multiprocess Runner [\#24](https://github.com/materialsproject/maggma/pull/24) ([shyamd](https://github.com/shyamd)) - GridFS Store update: use metadata field, update removes old file\(s\) [\#20](https://github.com/materialsproject/maggma/pull/20) ([dwinston](https://github.com/dwinston)) ## [v0.5.0](https://github.com/materialsproject/maggma/tree/v0.5.0) (2018-03-31) [Full Changelog](https://github.com/materialsproject/maggma/compare/0.4.0...v0.5.0) **Closed issues:** - Need from pymongo collection [\#18](https://github.com/materialsproject/maggma/issues/18) **Merged pull requests:** - Useability updates [\#19](https://github.com/materialsproject/maggma/pull/19) ([shyamd](https://github.com/shyamd)) ## [0.4.0](https://github.com/materialsproject/maggma/tree/0.4.0) (2018-02-28) [Full Changelog](https://github.com/materialsproject/maggma/compare/0.3.0...0.4.0) **Merged pull requests:** - New Multiprocessor and MPI Processor [\#17](https://github.com/materialsproject/maggma/pull/17) ([shyamd](https://github.com/shyamd)) - groupby change for memory/jsonstore [\#16](https://github.com/materialsproject/maggma/pull/16) ([montoyjh](https://github.com/montoyjh)) - Rename Schema to Validator [\#15](https://github.com/materialsproject/maggma/pull/15) ([mkhorton](https://github.com/mkhorton)) ## [0.3.0](https://github.com/materialsproject/maggma/tree/0.3.0) (2018-02-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.2.0...0.3.0) **Implemented enhancements:** - Vault enabled Store [\#8](https://github.com/materialsproject/maggma/issues/8) **Merged pull requests:** - PR for generic Schema class [\#14](https://github.com/materialsproject/maggma/pull/14) ([mkhorton](https://github.com/mkhorton)) - Issue 8 vault store [\#13](https://github.com/materialsproject/maggma/pull/13) ([shreddd](https://github.com/shreddd)) - adds grouping function and test to make aggregation-based builds [\#12](https://github.com/materialsproject/maggma/pull/12) ([montoyjh](https://github.com/montoyjh)) ## [v0.2.0](https://github.com/materialsproject/maggma/tree/v0.2.0) (2018-01-01) [Full Changelog](https://github.com/materialsproject/maggma/compare/v0.1.0...v0.2.0) **Closed issues:** - LU translation functions don't serialize [\#11](https://github.com/materialsproject/maggma/issues/11) **Merged pull requests:** - Mongolike mixin [\#10](https://github.com/materialsproject/maggma/pull/10) ([montoyjh](https://github.com/montoyjh)) ## [v0.1.0](https://github.com/materialsproject/maggma/tree/v0.1.0) (2017-11-08) [Full Changelog](https://github.com/materialsproject/maggma/compare/78ef2e8eacc051207350dc6abe886a403982aef8...v0.1.0) **Closed issues:** - ditch python 2 and support only 3? 
[\#3](https://github.com/materialsproject/maggma/issues/3) - Seeking clarifications [\#1](https://github.com/materialsproject/maggma/issues/1) **Merged pull requests:** - Do not wait until all items are processed to update targets [\#7](https://github.com/materialsproject/maggma/pull/7) ([dwinston](https://github.com/dwinston)) - Run builder with either MPI or multiprocessing [\#6](https://github.com/materialsproject/maggma/pull/6) ([matk86](https://github.com/matk86)) - add lava code and tool execution script [\#5](https://github.com/materialsproject/maggma/pull/5) ([gilbertozp](https://github.com/gilbertozp)) - Add eclipse project files to .gitignore [\#2](https://github.com/materialsproject/maggma/pull/2) ([gilbertozp](https://github.com/gilbertozp)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*
maggma-0.70.0/docs/concepts.md000066400000000000000000000065161470132070100161210ustar00rootroot00000000000000
# Concepts

`maggma`'s core classes -- [`Store`](#store) and [`Builder`](#builder) -- provide building blocks for modular data pipelines. Data resides in one or more `Store` and is processed by a `Builder`. The results of the processing are saved in another `Store`, and so on:

```mermaid
flowchart LR
    s1(Store 1) --Builder 1--> s2(Store 2) --Builder 2--> s3(Store 3)
    s2 -- Builder 3-->s4(Store 4)
```

## Store

A major challenge in building scalable data pipelines is dealing with all the different types of data sources out there. Maggma's `Store` class provides a consistent, unified interface for querying data from arbitrary data sources. It was originally built around MongoDB, so its interface closely resembles `PyMongo` syntax. However, Maggma makes it possible to use that same syntax to query other types of databases, such as Amazon S3, GridFS, or even files on disk.

Stores are databases containing organized document-based data. They represent either a data source or a data sink. They are modeled around the MongoDB collection, although they can represent more complex data sources that auto-alias keys without the user knowing, or even provide concatenation or joining of Stores.

Stores implement methods to `connect`, `query`, find `distinct` values, `groupby` fields, `update` documents, and `remove` documents. Stores also implement two fields that are critical for efficient document processing in Maggma: the `key` and the `last_updated_field`. `key` is the field used to uniquely index the underlying data source. `last_updated_field` is the timestamp of when that document was last modified.

## Builder

Builders represent a data processing step, analogous to an extract-transform-load (ETL) operation in a data warehouse model. Much like `Store`, the `Builder` class provides a consistent interface for writing data transformations, which are each broken into three phases: `get_items`, `process_item`, and `update_targets`:

1. `get_items`: Retrieve items from the source Store(s) for processing by the next phase
2. `process_item`: Manipulate the input item and create an output document that is sent to the next phase for storage.
3. `update_targets`: Add the processed item to the target Store(s).

Both `get_items` and `update_targets` can perform IO (input/output) to the data stores. `process_item` is expected to not perform any IO so that it can be parallelized by Maggma.
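To make the three phases concrete, here is a minimal sketch of a custom `Builder`. The store names, the `"energy"` field, and the doubling step are made-up placeholders for illustration only, not part of maggma itself:

```python
from maggma.core import Builder
from maggma.stores import MemoryStore


class MultiplyBuilder(Builder):
    """Toy builder that doubles the `energy` field of every source document."""

    def __init__(self, source, target, **kwargs):
        self.source = source
        self.target = target
        super().__init__(sources=[source], targets=[target], **kwargs)

    def get_items(self):
        # IO happens here: pull documents out of the source Store
        return self.source.query()

    def process_item(self, item):
        # No IO here, so Maggma is free to parallelize this step
        item["energy"] = 2 * item["energy"]
        return item

    def update_targets(self, items):
        # IO happens here: push the processed chunk into the target Store
        self.target.update(items)


source = MemoryStore(collection_name="source", key="task_id")
target = MemoryStore(collection_name="target", key="task_id")
MultiplyBuilder(source, target).run()
```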
Builders can be chained together into an array and then saved as a JSON file to be run on a production system.

## MSONable

Another challenge in building complex data-transformation codes is keeping track of all the settings necessary to make some output database. One bad solution is to hard-code these settings, but then any modification is difficult to keep track of. Maggma solves this by putting the configuration with the pipeline definition in JSON or YAML files. This is done using the `MSONable` pattern, which requires that any Maggma object (the databases and transformation steps) can convert itself to a python dictionary with its configuration parameters in a process called serialization. These dictionaries can then be converted back to the original Maggma object without having to know what class it belonged to. `MSONable` does this by injecting `@class` and `@module` keys that tell it where to find the original python code for that Maggma object.
maggma-0.70.0/docs/getting_started/000077500000000000000000000000001470132070100171415ustar00rootroot00000000000000
maggma-0.70.0/docs/getting_started/advanced_builder.md000066400000000000000000000040241470132070100227360ustar00rootroot00000000000000
# Advanced Builder Concepts

There are a number of features in `maggma` designed to assist with writing advanced builders:

## Logging

`maggma` builders have a python `logger` object that is already set up to output at the correct level. You can use it directly to emit `info`, `debug`, and `error` messages:

``` python
def get_items(self) -> Iterable:
    ...
    self.logger.info(f"Got {len(to_process_ids)} to process")
    ...
```

## Querying for Updated Documents

One of the most important features in a builder is incremental building, which allows the builder to process only new or changed documents. Two of the parameters for a maggma store, `last_updated_field` and `last_updated_type`, tell `maggma` how to deal with dates in the source and target documents. This allows us to get the `id` of any documents in the source that are newer than their counterparts in the target:

``` python
new_ids = self.target.newer_in(self.source)
```

## Speeding up Data Transfers

Since `maggma` is designed around Mongo-style data sources and sinks, building indexes or in-memory copies of the fields you want to search on is critical to getting the fastest possible data input/output (IO). Since this is very builder- and document-style dependent, `maggma` provides a direct interface to `ensure_indexes` on a Store. A common paradigm is to do this at the beginning of `get_items`:

``` python
def ensure_indexes(self):
    self.source.ensure_index("some_search_fields")
    self.target.ensure_index(self.target.key)

def get_items(self) -> Iterable:
    self.ensure_indexes()
    ...
```

## Built in Templates for Advanced Builders

`maggma` implements templates for builders that have many of the advanced features listed above:

- [MapBuilder](map_builder.md) creates a one-to-one document mapping of items in the source Store to the transformed documents in the target Store.
- [GroupBuilder](group_builder.md) creates a many-to-one document mapping of items in the source Store to transformed documents in the target Store.
maggma-0.70.0/docs/getting_started/advanced_stores.md000066400000000000000000000061011470132070100226250ustar00rootroot00000000000000
# Configurations and Usage of Advanced `Store`s

## S3Store

### Configuration

The S3Store interfaces with S3 object storage via [boto3](https://pypi.org/project/boto3/).
For this to work properly, you have to set your basic configuration in `~/.aws/config`:

```buildoutcfg
[default]
source_profile = default
```

Then, you have to set up your credentials in `~/.aws/credentials`:

```buildoutcfg
[default]
aws_access_key_id = YOUR_KEY
aws_secret_access_key = YOUR_SECRET
```

For more information on the configuration, please see the AWS [documentation](https://docs.aws.amazon.com/credref/latest/refdocs/settings-global.html). Note that while these configurations live in the `~/.aws` folder, they are shared by other similar services, such as the self-hosted [minio](https://min.io/) service.

### Basic Usage

MongoDB is not designed to handle large object storage. As such, we created an abstract object that combines the large object storage capabilities of Amazon S3 with the easy, python-friendly query language of MongoDB. These `S3Store`s all include an `index` store that only stores specific queryable data and the object key used to retrieve the data from an S3 bucket via the `key` attribute (called `'fs_id'` by default). An entry in the `index` may look something like this:

```
{
    fs_id : "5fc6b87e99071dfdf04ca871"
    task_id : "mp-12345"
}
```

Please note that since we give users the ability to reconstruct the index store from the object metadata, the size of an entry in the `index` is limited by the metadata and not by MongoDB. Different S3 services might have different rules, but the limit is typically smaller: 8 KB for [aws](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html).

The `S3Store` should be constructed as follows:

```python
from maggma.stores import MongoURIStore, S3Store

index = MongoURIStore(
    "mongodb+srv://:@",
    "atomate_aeccar0_fs_index",
    key="fs_id",
)
s3store = S3Store(
    index=index,
    bucket="<>",
    s3_profile="<>",
    compress=True,
    endpoint_url="<>",
    sub_dir="atomate_aeccar0_fs",
    s3_workers=4,
)
```

The `sub_dir` field creates subdirectories in the bucket to help the user organize their data.

### Parallelism

Once you start working with large quantities of data, the speed at which you process this data will often be limited by database I/O. For the most time-consuming part of the process, the upload, we have implemented thread-level parallelism in the `update` member function. The `update` function receives an entire chunk of processed data (as defined by `chunk_size`); however, `Store.update` is typically called in the `update_targets` part of a builder, where builder execution is no longer multi-threaded. As such, we multithread the execution inside of `update`, using `s3_workers` threads to perform the database write operation. As a general rule of thumb, if you notice that your update step is taking too long, you should adjust the `s3_workers` field; its optimal value depends on server-side resources.
maggma-0.70.0/docs/getting_started/file_store_dir_structure.png000066400000000000000000001237301470132070100247660ustar00rootroot00000000000000
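As a rough illustration of the `s3_workers` tuning discussed in the Parallelism section above, the sketch below raises the thread count on an `S3Store` backed by an in-memory index. The bucket name, document contents, and worker count are placeholder assumptions, not recommended values, and the bucket is assumed to already exist:

```python
from maggma.stores import MemoryStore, S3Store

# Placeholder index store; in production this is usually a Mongo-backed store.
index = MemoryStore(collection_name="fs_index", key="fs_id")

s3store = S3Store(
    index=index,
    bucket="my-example-bucket",  # assumed bucket name
    sub_dir="atomate_aeccar0_fs",
    s3_workers=8,                # raise this if the update step dominates runtime
)

with s3store as store:
    # update() fans the S3 uploads out over `s3_workers` threads
    store.update([{"fs_id": "abc123", "task_id": "mp-12345", "data": list(range(10))}])
```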
maggma-0.70.0/docs/getting_started/file_store_dir_structure.png000066400000000000000000001237301470132070100247660ustar00rootroot00000000000000[binary PNG image data omitted: "Example directory structure" figure referenced by using_file_store.md]
maggma-0.70.0/docs/getting_started/group_builder.md000066400000000000000000000072111470132070100223260ustar00rootroot00000000000000# Group Builder

Another advanced template in `maggma` is the `GroupBuilder`, which groups documents together before applying your function to the group of items. Just like `MapBuilder`, `GroupBuilder` handles incremental building, keeping track of errors, getting only the data you need, and managing timeouts. `GroupBuilder` won't delete orphaned documents, since that reverse relationship isn't valid.

Let's create a simple `ResupplyBuilder`, which will look at the inventory of items and determine which items need resupply. The source documents will look something like this:

``` JSON
{
    "name": "Banana",
    "type": "fruit",
    "quantity": 20,
    "minimum": 10,
    "last_updated": "2019-11-3T19:09:45"
}
```

Our builder should give us documents that look like this:

``` JSON
{
    "names": ["Grapes", "Apples", "Bananas"],
    "type": "fruit",
    "resupply": {
        "Apples": 10,
        "Bananas": 0,
        "Grapes": 5
    },
    "last_updated": "2019-11-3T19:09:45"
}
```

To begin, we define our `GroupBuilder`:

``` python
from typing import Dict, List

from maggma.builders import GroupBuilder
from maggma.core import Store


class ResupplyBuilder(GroupBuilder):
    """
    Simple builder that determines which items to resupply
    """

    def __init__(self, inventory: Store, resupply: Store, resupply_percent: int = 100, **kwargs):
        """
        Arguments:
            inventory: current inventory information
            resupply: target resupply information
            resupply_percent: the percent of the minimum to include in the resupply
        """
        self.inventory = inventory
        self.resupply = resupply
        self.resupply_percent = resupply_percent
        self.kwargs = kwargs

        super().__init__(source=inventory, target=resupply, grouping_properties=["type"], **kwargs)
```

Note that unlike the previous `MapBuilder` example, we didn't name the source and target stores `source` and `target`. Using more descriptive names is good practice when writing builders, since it makes it clearer what the underlying data should look like.

`GroupBuilder` inherits from `MapBuilder`, so it has the same configurational parameters:

- query: A query to apply to items in the source Store.
- projection: list of the fields you want to project. This can reduce the data transfer load if you only need certain fields or sub-documents from the source documents.
- timeout: optional timeout on the process function
- store_process_time: adds the process time into the target document for profiling
- retry_failed: retries running the process function on previously failed documents

One parameter that doesn't work in `GroupBuilder` is `delete_orphans`, since the Many-to-One relationship makes determining orphaned documents very difficult.

Finally, let's get to the hard part, which is running our function. We do this by defining `unary_function`:

``` python
    def unary_function(self, items: List[Dict]) -> Dict:
        resupply = {}

        for item in items:
            # No resupply is needed as long as the current quantity exceeds the minimum
            if item["quantity"] > item["minimum"]:
                resupply[item["name"]] = 0
            else:
                resupply[item["name"]] = int(item["minimum"] * self.resupply_percent / 100)
        return {"resupply": resupply}
```

Just as in `MapBuilder`, we're not returning all the extra information typically kept in the original item. Normally, we would have to write code that copies over the source `key` and converts it to the target `key`; the same goes for the `last_updated_field`. `GroupBuilder` takes care of this, while also recording errors, processing time, and the Builder version. `GroupBuilder` also keeps a plural version of the `source.key` field, so in this example all the `name` values will be put together and kept in `names`.
maggma-0.70.0/docs/getting_started/map_builder.md000066400000000000000000000057141470132070100217550ustar00rootroot00000000000000# Map Builder

`maggma` has a built-in builder called `MapBuilder`, which handles a number of tedious tasks in writing a builder. This class is designed to be used like the map operator found in other frameworks, or even Python's built-in `map` function. `MapBuilder` will take each document in the source store, apply the function you give it, and then store the result in the target store. It handles incremental building, keeping track of errors, getting only the data you need, managing timeouts, and deleting orphaned documents through configurational options.

Let's create the same `MultiplyBuilder` we wrote earlier using `MapBuilder`:

``` python
from maggma.builders import MapBuilder
from maggma.core import Store


class MultiplyBuilder(MapBuilder):
    """
    Simple builder that multiplies the "a" sub-document by pre-set value
    """
```

Just like before, we define a new class, but this time it should inherit from `MapBuilder`.

``` python
    def __init__(self, source: Store, target: Store, multiplier: int = 2, **kwargs):
        """
        Arguments:
            source: the source store
            target: the target store
            multiplier: the multiplier to apply to "a" sub-document
        """
        self.source = source
        self.target = target
        self.multiplier = multiplier
        self.kwargs = kwargs

        # Strip out any kwargs that we hardcode in the super().__init__() call below
        kwargs = {k: v for k, v in kwargs.items() if k not in ["projection", "delete_orphans", "timeout", "store_process_time", "retry_failed"]}

        super().__init__(source=source,
                         target=target,
                         projection=["a"],
                         delete_orphans=False,
                         timeout=10,
                         store_process_time=True,
                         retry_failed=True,
                         **kwargs)
```

`MapBuilder` has a number of configurational options that you can hardcode as above or expose to the user through `**kwargs`:

- projection: list of the fields you want to project.
This can reduce the data transfer load if you only need certain fields or sub-documents from the source documents - delete_orphans: this will delete documents in the target which don't have a corresponding document in the source - timeout: optional timeout on the process function - store_process_timeout: adds the process time into the target document for profiling - retry_failed: retries running the process function on previously failed documents Finally let's get to the hard part which is running our function. We do this by defining `unary_function` ``` python def unary_function(self,item): return {"a": item["a"] * self.multiplier} ``` Note that we're not returning all the extra information typically kept in the originally item. Normally, we would have to write code that copies over the source `key` and convert it to the target `key`. Same goes for the `last_updated_field`. `MapBuilder` takes care of this, while also recording errors, processing time, and the Builder version. maggma-0.70.0/docs/getting_started/mongodb.md000066400000000000000000000047441470132070100211210ustar00rootroot00000000000000# Setting up MongoDB Many users find MongoDB to best suit their data storage needs. While MongoDB [can be installed locally](https://www.mongodb.com/docs/manual/installation/), the easiest route is often to create a Mongo database via a cloud storage solution called [MongoDB Atlas](https://www.mongodb.com/atlas), which has a free tier. The setup instructions for using Maggma with MongoDB Atlas are described below: 1. Sign up for a free account on [MongoDB Atlas](https://www.mongodb.com/atlas). 2. Once logged in, select the "Create a Project" option and give your project a name (e.g. "MyProject"). Add your email address as the Project Owner. 3. Click the "Build a Database" button under the "Deployment > Database" section and choose the free (i.e. M0) option. Give your cluster a unique name (e.g. "MyCluster"). 4. Select "Create" and enter your desired login credentials that you will use to access your database. You are probably best off not using special characters here since it will be URL-encoded. You should also use different credentials than your usual, since it's not uncommon to share credentials with trusted colleagues. Select "Finish and Close" when done. 5. Go to the "Collections" tab of your cluster, which is where you will create a database (e.g. "my_database") and corresponding data collection (e.g. "my_collection") by clicking the "Add My Own Data" button. 6. Under the "Security > Network Access" section, edit the IP Access List to allow access from anywhere for maximum flexibility. 7. Finally, retrieve your MongoDB URI, which is the address of your MongoDB cluster. You can find your database's URI by clicking the "Database" section in the sidebar and then selecting "Connect > Compass" and copying the link of the form `mongodb+srv://:@`. To test that you can connect to your database, run the following code: ```python from maggma.stores import MongoURIStore # Define your database credentials store = MongoURIStore( "mongodb+srv://:@", "my_collection", database="my_database", ) # Query the database with store: print(store.count()) ``` !!! 
Note If you are using a self-hosted Mongo database, you will probably want to use a [`MongoStore`](https://materialsproject.github.io/maggma/reference/stores/#maggma.stores.mongolike.MongoStore) instead of the [`MongoURIStore`](https://materialsproject.github.io/maggma/reference/stores/#maggma.stores.mongolike.MongoURIStore), which takes slightly different arguments. maggma-0.70.0/docs/getting_started/query_101.md000066400000000000000000000144671470132070100212250ustar00rootroot00000000000000# Understanding Queries Putting your data into a `maggma` `Store` gives you powerful search, summary, and analytical capabilities. All are based on "queries", which specify how you want to search your data, and which parts of it you want to get in return. `maggma` query syntax closely follows [MongoDB Query syntax](https://www.mongodb.com/docs/manual/tutorial/query-documents/). In this tutorial, we'll cover the syntax of the most common query operations. You can refer to the [MongoDB](https://www.mongodb.com/docs/manual/tutorial/query-documents/) or [pymongo](https://pymongo.readthedocs.io/en/stable/tutorial.html) (python interface to MongoDB) documentation for examples of more advanced use cases. Let's create an example dataset describing the [Teenage Mutant Ninja Turtles](https://en.wikipedia.org/wiki/Teenage_Mutant_Ninja_Turtles). ```python >>> turtles = [{"name": "Leonardo", "color": "blue", "tool": "sword", "occupation": "ninja" }, {"name": "Donatello", "color": "purple", "tool": "staff", "occupation": "ninja" }, {"name": "Michelangelo", "color": "orange", "tool": "nunchuks", "occupation": "ninja" }, {"name":"Raphael", "color": "red", "tool": "sai", "occupation": "ninja" }, {"name":"Splinter", "occupation": "sensei" } ] ``` Notice how this data follows the principles described in [Structuring `Store` data](stores.md/#structuring-store-data): - every document (`dict`) has a `name` key with a unique value - every document has a common set of keys (`name`, `occupation`). - Note that SOME documents also share the keys `tool` and `color`, but not all. This is OK. For the rest of this tutorial, we will assume that this data has already been added to a `Store` called `tmnt_store`, which we are going to query. ## The `query` method `Store.query()` is the primary method you will use to search your data. - `query` always returns a generator yielding any and all documents that match the query you provide. - There are no mandatory arguments. If you run `query()` you will get a generator containing all documents in the `Store` - The first (optional) argument is `criteria`, which is a query formatted as a `dict` as described in the next section. - You can also specify `properties`, which is a list of fields from the documents you want to return. This is useful when working with large documents because then you only have to download the data you need rather than the entire document. - You can also `skip` every N documents, `limit` the number of documents returned, and `sort` the result by some field. Since `query` returns a generator, you will typically want to turn the results into a list, or use them in a `for` loop. Turn into a list ```python results = [d for d in store.query()] ``` Use in a `for` loop ```python for doc in store.query(): print(doc) ``` ## The structure of a query A query is also a `dict`. Each key in the dict corresponds to a fjeld in the documents you want to query (such as `name`, `color`, etc.), and the value is the value of that key that you want to match. 
For example, a query to select all documents where `occupation` is `ninja`, would look like ```python {"occupation": "ninja"} ``` This query will be passed as an argument to `Store` methods like `query_one`, `query`, and `count`, as demonstrated next. ## Example queries ### Match a single value To select all records where a field matches a single value, set the key to the field you want to match and its value to the value you are looking for. Return all records where 'occupation' is 'ninja' ```python >>> with tmnt_store as store: ... results = list(store.query({"occupation": "ninja"})) >>> len(results) 4 ``` Return all records where 'name' is 'Splinter' ```python >>> with tmnt_store as store: ... results = list(store.query({"name": "Splinter"})) >>> len(results) 1 ``` ### Match any value in a list: `$in` To find all documents where a field matches one of several different values, use `$in` with a list of the value you want to search. ```python >>> with tmnt_store as store: ... results = list(store.query({"color": {"$in": ["red", "blue"]}})) >>> len(results) 2 ``` `$in` is an example of a "query operator". Others include: - `$nin`: a value is NOT in a list (the inverse of the above example) - `$gt`, `$gte`: greater than, greater than or equal to a value - `$lt`, `$lte`: greater than, greater than or equal to a value - `$ne`: not equal to a value - `$not`: inverts the effect of a query expression, returning results that do NOT match. See the [MongoDB docs](https://www.mongodb.com/docs/manual/reference/operator/query/#query-selectors) for a complete list. !!! Note When using query operators like `$in`, you must include a nested `dict` in your query, where the operator is the key and the search parameters are the value, e.g., the dictionary `{"$in": ["red", "blue"]}` is the **value** associated with the search field (`color`) in the parent dictionary. ### Nested fields Suppose that our documents had a nested structure, for example, by having separate fields for first and last name: ```python >>> turtles = [{"name": {"first": "Leonardo", "last": "turtle" }, "color": "blue", "tool": "sword", "occupation": "ninja" }, ... ] ``` You can query nested fields by placing a period `.` between each level in the hierarchy. For example: ```python >>> with tmnt_store as store: ... results = list(store.query({"name.first": "Splinter"})) >>> len(results) 1 ``` ### Numerical Values You can query numerical values in analogous fashion to the examples given above. !!! Note When querying on numerical values, be mindful of the `type` of the data. Data stored in `json` format is often converted entirely to `str`, so if you use a numerical query operator like `$gte`, you might not get the results you expect unless you first verify that the numerical data in the `Store` is a `float` or `int` . maggma-0.70.0/docs/getting_started/running_builders.md000066400000000000000000000166631470132070100230500ustar00rootroot00000000000000# Running Builders `maggma` is designed to run build-pipelines in a production environment. Builders can be run directly in a python environment, but this gives you none of the performance features such as multiprocessing. The base `Builder` class implements a simple `run` method that can be used to run that builder: ``` python class MultiplyBuilder(Builder): """ Simple builder that multiplies the "a" sub-document by pre-set value """ ... 
my_builder = MultiplyBuilder(source_store,target_store,multiplier=3) my_builder.run() ``` A better way to run this builder would be to use the `mrun` command line tool. Since everything in `maggma` is MSONable, we can use `monty` to dump the builders into a JSON file: ``` python from monty.serialization import dumpfn dumpfn(my_builder,"my_builder.json") ``` Then we can run the builder using `mrun`: ``` shell mrun my_builder.json ``` `mrun` has a number of useful options: ``` shell mrun --help Usage: mrun [OPTIONS] [BUILDERS]... Options: -v, --verbose Controls logging level per number of v's -n, --num-workers INTEGER RANGE Number of worker processes. Defaults to single processing --help Show this message and exit. ``` We can use the `-n` option to control how many workers run `process_items` in parallel. Similarly, `-v` controls the logging verbosity from just WARNINGs to INFO to DEBUG output. The result will be something that looks like this: ``` shell 2020-01-08 14:33:17,187 - Builder - INFO - Starting Builder Builder 2020-01-08 14:33:17,217 - Builder - INFO - Processing 100 items Get: 100%|██████████████████████████████████| 100/100 [00:00<00:00, 15366.00it/s] 2020-01-08 14:33:17,235 - MultiProcessor - INFO - Processing batch of 1000 items Update Targets: 100%|█████████████████████████| 100/100 [00:00<00:00, 584.51it/s] Process Items: 100%|██████████████████████████| 100/100 [00:00<00:00, 567.39it/s] ``` There are progress bars for each of the three steps, which lets you understand what the slowest step is and the overall progress of the system. ## Running Distributed `maggma` can distribute work across multiple computers. There are two steps to this: 1. Run a `mrun` manager by providing it with a `--url` to listen for workers on and `--num-chunks`(`-N`) which tells `mrun` how many sub-pieces to break up the work into. You can can run fewer workers then chunks. This will cause `mrun` to call the builder's `prechunk` to get the distribution of work and run distributed work on all workers 2. Run `mrun` workers b y providing it with a `--url` to listen for a manager and `--num-workers` (`-n`) to tell it how many processes to run in this worker. The `url` argument takes a fully qualified url including protocol. `tcp` is recommended: Example: `tcp://127.0.0.1:8080` ## Running Scripts `mrun` has the ability to run Builders defined in python scripts or in jupyter-notebooks. The only requirements are: 1. The builder file has to be in a sub-directory from where `mrun` is called. 2. The builders you want to run are in a variable called `__builder__` or `__builders__` `mrun` will run the whole python/jupyter file, grab the builders in these variables and adds these builders to the builder queue. Assuming you have a builder in a python file: `my_builder.py` ``` python class MultiplyBuilder(Builder): """ Simple builder that multiplies the "a" sub-document by pre-set value """ ... __builder__ = MultiplyBuilder(source_store,target_store,multiplier=3) ``` You can use `mrun` to run this builder and parallelize for you: ``` shell mrun -n 2 -v my_builder.py ``` ## Running Multiple Builders `mrun` can run multiple builders. You can have multiple builders in a single file: `json`, `python`, or `jupyter-notebook`. Or you can chain multiple files in the order you want to run them: ``` shell mrun -n 32 -vv my_first_builder.json builder_2_and_3.py last_builder.ipynb ``` `mrun` will then execute the builders in these files in order. 
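For example, a single script can expose several builders at once through the `__builders__` variable. The sketch below assumes a hypothetical file named `builder_2_and_3.py` in which `MultiplyBuilder` and the three stores are already defined or imported:

``` python
# builder_2_and_3.py (hypothetical script collecting two builders)
# MultiplyBuilder, store_a, store_b, and store_c are assumed to be defined above or imported.
__builders__ = [
    MultiplyBuilder(store_a, store_b, multiplier=2),
    MultiplyBuilder(store_b, store_c, multiplier=5),
]
```

`mrun` will add both builders from this file to the builder queue.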
## Reporting Build State `mrun` has the ability to report the status of the build pipeline to a user-provided `Store`. To do this, you first have to save the `Store` as a JSON or YAML file. Then you can use the `-r` option to give this to `mrun`. It will then periodically add documents to the `Store` for one of 3 different events: * `BUILD_STARTED` - This event tells us that a new builder started, the names of the `sources` and `targets` as well as the `total` number of items the builder expects to process * `UPDATE` - This event tells us that a batch of items was processed and is going to `update_targets`. The number of items is stored in `items`. * `BUILD_ENDED` - This event tells us the build process finished this specific builder. It also indicates the total number of `errors` and `warnings` that were caught during the process. These event docs also contain the `builder`, a `build_id` which is unique for each time a builder is run and anonymous but unique ID for the machine the builder was run on. ## Profiling Memory Usage of Builders `mrun` can optionally profile the memory usage of a running builder by using the Memray Python memory profiling tool ([Memray](https://github.com/bloomberg/memray)). To get started, Memray should be installed in the same environment as `maggma` using `pip install memray` (r `pip install maggma[memray]`). Setting the `--memray` (`-m`) option to `on`, or `True`, will signal `mrun` to profile the memory usage of any builders passed to `mrun` as the builders are running. The profiler also supports profiling of both single and forked processes. For example, spawning multiple processes in `mrun` with `-n` will signal the profiler to track any forked child processes spawned from the parent process. A basic invocation of the memory profiler using the `mrun` command line tool would look like this: ``` shell mrun --memray on my_builder.json ``` The profiler will generate two files after the builder finishes: 1. An output `.bin` file that is dumped by default into the `temp` directory, which is platform/OS dependent. For Linux/MacOS this will be `/tmp/` and for Windows the target directory will be `C:\TEMP\`.The output file will have a generic naming pattern as follows: `BUILDER_NAME_PASSED_TO_MRUN + BUILDER_START_DATETIME_ISO.bin`, e.g., `my_builder.json_2023-06-09T13:57:48.446361.bin`. 2. A `.html` flamegraph file that will be written to the same directory as the `.bin` dump file. The flamegraph will have a naming pattern similar to the following: `memray-flamegraph-my_builder.json_2023-06-09T13:57:48.446361.html`. The flamegraph can be viewed using any web browser. ***Note***: Different platforms/operating systems purge their system's `temp` directory at different intervals. It is recommended to move at least the `.bin` file to a more stable location. The `.bin` file can be used to recreate the flamegraph at anytime using the Memray CLI. Using the flag `--memray-dir` (`-md`) allows for specifying an output directory for the `.bin` and `.html` files created by the profiler. The provided directory will be created if the directory does not exist, mimicking the `mkdir -p` command. Further data visualization and transform examples can be found in Memray's documentation ([Memray reporters](https://bloomberg.github.io/memray/live.html)). 
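As a concrete sketch of the build-report workflow described in the "Reporting Build State" section above (the connection details below are placeholders):

``` python
from monty.serialization import dumpfn
from maggma.stores import MongoStore

# Store that will receive the BUILD_STARTED / UPDATE / BUILD_ENDED event documents
report_store = MongoStore(
    database="my_db_name",
    collection_name="builder_reports",
    host="my_hostname",
    port=27017,
)
dumpfn(report_store, "report_store.json")
```

The serialized store can then be passed to `mrun` with the reporting option:

``` shell
mrun -v -r report_store.json my_builder.json
```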
maggma-0.70.0/docs/getting_started/simple_builder.md000066400000000000000000000203031470132070100224600ustar00rootroot00000000000000# Writing a Builder ## Builder Architecture A `Builder` is a class that inherits from `maggma.core.Builder` and implement 3 methods: * `get_items`: This method should return some iterable of items to run through `process_items` * `process_item`: This method should take a single item, process it, and return the processed item * `update_targets`: This method should take a list of processed items and update the target stores. To make this less abstract, we will write a builder that multiplies the "a" sub-document by a pre-configured `multiplier`. Let's assume we have some source collection in MongoDB with documents that look like this: ``` json { "id": 1, "a": 3, "last_updated": "2019-11-3" } ``` ## Class definition and `__init__` A simple class definition for a Maggma-based builder looks like this: ``` python from maggma.core import Builder from maggma.core import Store class MultiplyBuilder(Builder): """ Simple builder that multiplies the "a" sub-document by pre-set value """ ``` The `__init__` for a builder can have any set of parameters. Generally, you want a source `Store` and a target `Store` along with any parameters that configure the builder. Due to the `MSONable` pattern, any parameters to `__init__` have to be stored as attributes. A simple `__init__` would look like this: ``` python def __init__(self, source: Store, target: Store, multiplier: int = 2, **kwargs): """ Arguments: source: the source store target: the target store multiplier: the multiplier to apply to "a" sub-document """ self.source = source self.target = target self.multiplier = multiplier self.kwargs = kwargs super().__init__(sources=source,targets=target,**kwargs) ``` Python type annotations provide a really nice way of documenting the types we expect and being able to later type check using `mypy`. We defined the type for `source` and `target` as `Store` since we only care that implements that pattern. How exactly these `Store`s operate doesn't concern us here. Note that the `__init__` arguments: `source`, `target`, `multiplier`, and `kwargs` get saved as attributes: ``` python self.source = source self.target = target self.multiplier = multiplier self.kwargs = kwargs ``` Finally, we want to call the base `Builder`'s `__init__` to tell it our sources and targets for this builder. In addition, we pass along any extra parameters that might configured the base builder class. ``` python super().__init__(sources=source,targets=target,**kwargs) ``` Calling the parent class `__init__` is a good practice as sub-classing builders is a good way to encapsulate complex logic. ## `get_items` `get_items` is conceptually a simple method to implement, but in practice can easily be more code than the rest of the builder. All of the logic for getting data from the sources has to happen here, which requires some planning. `get_items` should also sort all of the data into individual **items** to process. This simple builder has a very easy `get_items`: ``` python def get_items(self) -> Iterator: """ Gets induvidual documents to multiply """ return self.source.query() ``` Here, get items just returns the results of `query()` from the store. 
It could also have been written as a generator: ``` python def get_items(self) -> Iterable: """ Gets induvidual documents to multiply """ for doc in self.source.query(): yield doc ``` We could have also returned a list of items: ``` python def get_items(self) -> Iterable: """ Gets induvidual documents to multiply """ docs = list(self.source.query()) ``` One advantage of using the generator approach is it is less memory intensive than the approach where a list of items returned. For large datasets, returning a list of all items for processing may be prohibitive due to memory constraints. ## `process_item` `process_item` just has to do the parallelizable work on each item. Since the item is whatever comes out of `get_items`, you know exactly what it should be. It may be a single document, a list of documents, a mapping, a set, etc. Our simple process item just has to multiply one field by `self.multiplier`: ``` python def process_items(self, item : Dict) -> Dict: """ Multiplies the "a" sub-document by self.multiplier """ new_item = dict(**item) new_item["a"] *= self.multiplier return new_item ``` ## `update_targets` Finally, we have to put the processed item in to the target store: ``` python def update_targets(self,items: List[Dict]): """ Adds the processed items into the target store """ self.target.update(items) ``` !!! note Note that whatever `process_items` returns, `update_targets` takes a `List` of these: For instance, if `process_items` returns `str`, then `update_targets` would look like: ``` python def update_target(self,items: List[str]): ``` Putting it all together we get: ``` python from typing import Dict, Iterable, List from maggma.core import Builder from maggma.core import Store class MultiplyBuilder(Builder): """ Simple builder that multiplies the "a" sub-document by pre-set value """ def __init__(self, source: Store, target: Store, multiplier: int = 2, **kwargs): """ Arguments: source: the source store target: the target store multiplier: the multiplier to apply to "a" sub-document """ self.source = source self.target = target self.multiplier = multiplier self.kwargs = kwargs super().__init__(sources=source,targets=target,**kwargs) def get_items(self) -> Iterable: """ Gets induvidual documents to multiply """ docs = list(self.source.query()) def process_items(self, item : Dict) -> Dict: """ Multiplies the "a" sub-document by self.multiplier """ new_item = dict(**item) new_item["a"] *= self.multiplier return new_item def update_targets(self,items: List[Dict]): """ Adds the processed items into the target store """ self.target.update(items) ``` ## Distributed Processing `maggma` can distribute a builder across multiple computers. The `Builder` must have a `prechunk` method defined. `prechunk` should do a subset of `get_items` to figure out what needs to be processed and then return dictionaries that modify the `Builder` in-place to only work on each subset. For example, if in the above example we'd first have to update the builder to be able to work on a subset of keys. 
One pattern is to define a generic `query` argument for the builder and use that in get items: ``` python def __init__(self, source: Store, target: Store, multiplier: int = 2, query: Optional[Dict] = None, **kwargs): """ Arguments: source: the source store target: the target store multiplier: the multiplier to apply to "a" sub-document """ self.source = source self.target = target self.multiplier = multiplier self.query = query self.kwargs = kwargs super().__init__(sources=source,targets=target,**kwargs) def get_items(self) -> Iterable: """ Gets induvidual documents to multiply """ query = self.query or {} docs = list(self.source.query(criteria=query)) ``` Then we can define a prechunk method that modifies the `Builder` dict in place to operate on just a subset of the keys: ``` python from maggma.utils import grouper def prechunk(self, number_splits: int) -> Iterable[Dict]: keys = self.source.distinct(self.source.key) for split in grouper(keys, N): yield { "query": {self.source.key: {"$in": list(split)}} } ``` When distributed processing runs, it will modify the `Builder` dictionary in place by the prechunk dictionary. In this case, each builder distribute to a worker will get a modified `query` parameter that only runs on a subset of all possible keys. maggma-0.70.0/docs/getting_started/stores.md000066400000000000000000000210201470132070100207750ustar00rootroot00000000000000# Using `Store` A `Store` is just a wrapper to access data from a data source. That data source is typically a MongoDB collection, but it could also be an Amazon S3 bucket, a GridFS collection, or folder of files on disk. `maggma` makes interacting with all of these data sources feel the same (see the [`Store` interface](#the-store-interface), below). `Store` can also perform logic, concatenating two or more `Store` together to make them look like one data source for instance. The benefit of the `Store` interface is that you only have to write a `Builder` once. As your data moves or evolves, you simply point it to different `Store` without having to change your processing code. ## Structuring `Store` data Because `Store` is built around a MongoDB-like query syntax, data that goes into `Store` needs to be structured similarly to MongoDB data. In python terms, that means **the data in a `Store` must be structured as a `list` of `dict`**, where each `dict` represents a single record (called a 'document'). ```python data = [{"AM": "sunrise"}, {"PM": "sunset"} ... ] ``` Note that this structure is very similar to the widely-used [JSON](https://en.wikipedia.org/wiki/JSON) format. So structuring your data in this manner enables highly flexible storage options -- you can easily write it to a `.json` file, place it in a `Store`, insert it into a Mongo database, etc. `maggma` is designed to facilitate this. In addition to being structured as a `list` of `dict`, **every document (`dict`) must have a key that uniquely identifies it.** By default, this key is the `task_id`, but it can be set to any value you like using the `key` argument when you instantiate a `Store`. ```python data = [{"task_id": 1, "AM": "sunrise"}, {"task_id: 2, "PM": "sunset"} ... ] ``` Just to emphasize - **every document must have a `task_id`, and the value of `task_id` must be unique for every document**. The rest of the document structure is up to you, but `maggma` works best when every document follows a pre-defined schema (i.e., all `dict` have the same set of keys / same structure). 
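As a quick illustration of this structure, here is a minimal sketch that loads the example documents into an in-memory store (a `MemoryStore`, which requires no connection details):

```python
from maggma.stores import MemoryStore

data = [
    {"task_id": 1, "AM": "sunrise"},
    {"task_id": 2, "PM": "sunset"},
]

store = MemoryStore(key="task_id")  # "task_id" is also the default key
with store:
    store.update(data)
    print(store.count())                    # -> 2
    print(store.query_one({"task_id": 2}))  # the document for task_id 2
```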
## The `Store` interface All `Store` provide a number of basic methods that facilitate querying, updating, and removing data: - `query`: Standard mongo style `find` method that lets you search the store. See [Understanding Queries](query_101.md) for more details about the query syntax. - `query_one`: Same as above but limits returned results to just the first document that matches your query. Very useful for understanding the structure of the returned data. - `count`: Counts documents in the `Store` - `distinct`: Returns a list of distinct values of a field. - `groupby`: Similar to query but performs a grouping operation and returns sets of documents. - `update`: Update (insert) documents into the `Store`. This will overwrite documents if the key field matches. - `remove_docs`: Removes documents from the underlying data source. - `newer_in`: Finds all documents that are newer in the target collection and returns their `key`s. This is a very useful way of performing incremental processing. - `ensure_index`: Creates an index for the underlying data-source for fast querying. - `last_updated`: Finds the most recently updated `last_updated_field` value and returns that. Useful for knowing how old a data-source is. !!! Note If you are familiar with `pymongo`, you may find the comparison table below helpful. This table illustrates how `maggma` method and argument names map onto `pymongo` concepts. | `maggma` | `pymongo` equivalent | | -------- | ------- | | **methods** | | `query_one` | `find_one` | | `query` | `find` | | `count` | `count_documents` | | `distinct` | `distinct` | | `groupby` | `group` | | `update` | `insert` | | **arguments** | | `criteria={}` | `filter={}` | | `properties=[]` | `projection=[]` | ## Creating a Store All `Store`s have a few basic arguments that are critical for basic usage. Every `Store` has two attributes that the user should customize based on the data contained in that store: `key` and `last_updated_field`. The `key` defines how the `Store` tells documents apart. Typically this is `_id` in MongoDB, but you could use your own field (be sure all values under the key field can be used to uniquely identify documents). `last_updated_field` tells `Store` how to order the documents by a date, which is typically in the `datetime` format, but can also be an ISO 8601-format (ex: `2009-05-28T16:15:00`) `Store`s can also take a `Validator` object to make sure the data going into it obeys some schema. In the example below, we create a `MongoStore`, which connects to a MongoDB database. To create this store, we have to provide `maggma` the connection details to the database like the hostname, collection name, and authentication info. Note that we've set `key='name'` because we want to use that `name` as our unique identifier. ```python >>> store = MongoStore(database="my_db_name", collection_name="my_collection_name", username="my_username", password="my_password", host="my_hostname", port=27017, key="name", ) ``` The specific arguments required to create a `Store` depend on the underlying format. For example, the `MemoryStore`, which just loads data into memory, requires no arguments to instantiate. Refer to the [list of Stores](#list-of-stores) below (and their associated documentation) for specific details. ## Connecting to a `Store` You must connect to a store by running `store.connect()` before querying or updating the store. 
If you are operating on the stores inside of another code it is recommended to use the built-in context manager, e.g.: ```python with MongoStore(...) as store: store.query() ``` This will take care of the `connect()` automatically while ensuring that the connection is closed properly after the store tasks are complete. ## List of Stores Current working and tested `Store` include the following. Click the name of each store for more detailed documentation. - [`MongoStore`](/maggma/reference/stores/#maggma.stores.mongolike.MongoStore): interfaces to a MongoDB Collection using port and hostname. - [`MongoURIStore`](/maggma/reference/stores/#maggma.stores.mongolike.MongoURIStore): interfaces to a MongoDB Collection using a "mongodb+srv://" URI. - [`MemoryStore`](/maggma/reference/stores/#maggma.stores.mongolike.MemoryStore): just a Store that exists temporarily in memory - [`JSONStore`](/maggma/reference/stores/#maggma.stores.mongolike.JSONStore): builds a MemoryStore and then populates it with the contents of the given JSON files - [`FileStore`](/maggma/reference/stores/#maggma.stores.file_store.FileStore): query and add metadata to files stored on disk as if they were in a database - [`GridFSStore`](/maggma/reference/stores/#maggma.stores.gridfs.GridFSStore): interfaces to GridFS collection in MongoDB using port and hostname. - [`GridFSURIStore`](/maggma/reference/stores/#maggma.stores.gridfs.GridFSURIStore): interfaces to GridFS collection in MongoDB using a "mongodb+srv://" URI. - [`S3Store`](/maggma/reference/stores/#maggma.stores.aws.S3Store): provides an interface to an S3 Bucket either on AWS or self-hosted solutions ([additional documentation](advanced_stores.md)) - [`ConcatStore`](/maggma/reference/stores/#maggma.stores.compound_stores.ConcatStore): concatenates several Stores together so they look like one Store - [`VaultStore`](/maggma/reference/stores/#maggma.stores.advanced_stores.VaultStore): uses Vault to get credentials for a MongoDB database - [`AliasingStore`](/maggma/reference/stores/#maggma.stores.advanced_stores.AliasingStore): aliases keys from the underlying store to new names - `SandboxStore: provides permission control to documents via a `_sbxn` sandbox key - [`JointStore`](/maggma/reference/stores/#maggma.stores.compound_stores.JointStore): joins several MongoDB collections together, merging documents with the same `key`, so they look like one collection - [`AzureBlobStore`](/maggma/reference/stores/#maggma.stores.azure.AzureBlobStore): provides an interface to Azure Blobs for the storage of large amount of data - [`MontyStore`](/maggma/reference/stores/#maggma.stores.mongolike.MontyStore): provides an interface to [montydb](https://github.com/davidlatwe/montydb) for in-memory or filesystem-based storage - [`MongograntStore`](/maggma/reference/stores/#maggma.stores.advanced_stores.MongograntStore): (DEPRECATED) uses Mongogrant to get credentials for MongoDB database maggma-0.70.0/docs/getting_started/using_file_store.md000066400000000000000000000241161470132070100230270ustar00rootroot00000000000000# Using `FileStore` for files on disk The first step in any `maggma` pipeline is creating a `Store` so that data can be queried and transformed. Often times your data will originate as files on disk (e.g., calculation output files, files generated by instruments, etc.). `FileStore` provides a convenient way to access this type of data as if it were in a database, making it possible to `query`, add metadata, and run `Builder` on it. 
Suppose you have some data files organized in the following directory structure: ![Example directory structure](file_store_dir_structure.png){ width="300" } ## Creating the `FileStore` To create a `Filestore`, simply pass the path to the top-level directory that contains the files. ```python >>> fs = FileStore('/path/to/file_store_test/') >>> fs.connect() ``` On `connect()`, `FileStore` iterates through all files in the base directory and all subdirectories. For each file, it creates dict-like record based on the file's metadata such as name, size, last modification date, etc. These records are kept in memory using an internal `MemoryStore`. An example record is shown below. ```python {'_id': ObjectId('625e581113cef6275a992abe'), 'name': 'input.in', 'path': '/test_files/file_store_test/calculation1/input.in', 'parent': 'calculation1', 'size': 90, 'file_id': '2d12e9803fa0c6eaffb065c8dc3cf4fe', 'last_updated': datetime.datetime(2022, 4, 19, 5, 23, 54, 109000), 'hash': 'd42c9ff24dc2fde99ed831ec767bd3fb', 'orphan': False, 'contents': 'This is the file named input.in\nIn directory calculation1\nin the FileStore test directory.'} ``` ### Choosing files to index To restrict which files are indexed by the Store (which can improve performance), the optional keyword arguments `max_depth` and `file_filters` can be used. For example, to index only files ending in ".in", use ```python >>> fs = FileStore('/path/to/my/data', file_filters=["*.in"]) ``` You can pass multiple `file_filters` and use regex-like [fnmatch](https://docs.python.org/3/library/fnmatch.html) patterns as well. For example, to index all files ending in ".in" or named "test-X.txt" where X is any single letter between a and d, use ```python >>> fs = FileStore('/path/to/my/data', file_filters=["*.in","test-[abcd].txt"]) ``` If you only want to index the root directory and exclude all subdirectories, use `max_depth=0`, e.g. ```python >>> fs = FileStore('/path/to/my/data', max_depth=0) ``` ### Write access By default, the `FileStore` is read-only. However you can set `read_only=False` if you want to add additional metadata to the data (See ["Adding Metadata"](#adding-metadata) below). This metadata is stored in a .json file placed in the root directory of the `FileStore` (the name of the file can be customized with the `json_name` keyword argument.) ```python >>> fs = FileStore('/path/to/my/data', read_only=False, json_name='my_store.json') ``` Several methods that modify the contents of the `FileStore` such as `add_metadata`, `update`, and `remove_docs` will not work unless the store is writable (i.e., `read_only=False`). ### File identifiers (`file_id`) Each file is uniquely identified by a `file_id` key, which is computed from the hash of the file's path relative to the base `FileStore` directory. Unique identifiers for every file are necessary to enable `Builder` to work correctly and for associating custom metadata (See ["Adding Metadata"](#adding-metadata) below). By using the relative path instead of the absolute path makes it possible to move the entire `FileStore` to a new location on disk without changing `file_id` (as long as the relative paths don't change). ## Connecting and querying As with any `Store`, you have to `connect()` before you can query any data from a `FileStore`. After that, you can use `query_one()` to examine a single document or `query()` to return an iterator of matching documents. 
For example, let's print the parent directory of each of the files named "input.in" in our example `FileStore`: ```python >>> fs.connect() >>> [d["parent"] for d in fs.query({"name":"input.in"})] ['calculation2', 'calculation1'] ``` ### Performance **_NOTE_** `FileStore` can take a long time to `connect()` when there are more than a few hundred files in the directory. This is due to limitations of the `mongomock` package that powers the internal `MemoryStore`. We hope to identify a more performant alternative in the near future. In the mean time, use `file_filters` and `max_depth` to limit the total number of files in the `FileStore`. ### File Contents When you `query()` data, `FileStore` attempts to read the contents of each matching file and include them in the `contents` key of the returned dictionary, as you can see in the example above. There is an optional keyword argument `contents_size_limit` which specifies the maximum size of file that `FileStore` will attempt to read. At present, this only works with text files and the entire file contents are returned as a single string. If a file is too large to read, or if `FileStore` was unable to open the file (because it is a binary file, etc.), then you will see `contents` populated with a message that beings with `"Unable to read:`. **This behavior may change in the future.** ## Adding metadata As long as a store is not read-only (see #write-access), you can `update()` documents in it just like any other `Store`. This is a great way to associate additional information with raw data files. For example, if you have a store of files generated by an instrument, you can add metadata related to the environmental conditions, the sample that was tested, etc. ### `update` method You can use `update()` to add keys to the `FileStore` records. For example, to add some tags to the files named "input.in", use: ```python docs = [d for d in fs.query({"name":"input.in"})] for d in docs: d["tags"] = ["preliminary"] fs.update(docs) ``` The above steps will result in the following contents being added to the .json file. This metadata will be automatically read back in next time you connect to the Store. ```json [{"path":".../file_store_test/calculation2/input.in", "file_id":"3c3012f84c162e9ff9bb834c53dd1f58", "tags":["preliminary"]}, {"path":".../file_store_test/calculation1/input.in", "file_id":"fde43ea119034eb8732d6f3f0d9802ce", "tags":["preliminary"]}] ``` Notice that only the items modified with extra keys are written to the JSON (i.e., if you have 10 items in the store but add metadata to just two, only the two items will be written to the JSON). The purpose of this behavior is to prevent any duplication of data. The `file_id` and `path` are retained in the JSON file to make each metadata record manually identifiable. ### `add_metadata` convenience method A more convenient way to add metadata is via the `add_metadata` method. To use it, just pass a query to identify the documents you want to update, and a dict to add to the document. Here is what the [example above](#the-update-method) would look like using `add_metadata` ```python fs.add_metadata({"name":"input.in"}, {"tags":["preliminary"]}) ``` ### Automatic metadata You can even define a function to automatically create metadata from file or directory names. 
For example, if you prefix all your files with datestamps (e.g., '2022-05-07_experiment.csv'), you can write a simple string parsing function to extract information from any key in a `FileStore` record and pass the function as an argument to `add_metadata`. For example, to extract the date from files named like '2022-05-07_experiment.csv' and add it to the 'date' field: ```python >>> def get_date_from_filename(d): """ Args: d: An item returned from the `FileStore` """ return {"date": d["name"].split("_")[0], "test_name": d["name"].split("_")[1] } >>> fs.add_metadata({}, auto_data=get_date_from_filename) ``` ### Protected Keys Note that when using any of the above methods, you cannot modify any keys that are populated by default (e.g. `name`, `parent`, `file_id`), because they are derived directly from the files on disk. ### Orphaned Metadata In the course of working with `FileStore` you may encounter a situation where there are metadata records stored in the JSON file that no longer match files on disk. This can happen if, for example, you init a `FileStore` and later delete a file, or if you init the store with the default arguments but later restrict the file selection with `max_depth` or `file_filters`. These orphaned metadata records will appear in the `FileStore` with the field `{"orphan": True}`. The goal with this behavior is to preserve all metadata the user may have added and prevent data loss. By default, **orphaned metadata is excluded from query results**. There is an `include_orphans` keyword argument you can set on init if you want orphaned metadata to be returned in queries. ## Deleting files For consistency with the `Store` interface, `FileStore` provides the `remove_docs` method whenever `read_only=False`. **This method will delete files on disk**, because `FileStore` documents are simply representations of those files. It has an additional guard argument `confirm` which must be set to the non-default value `True` for the method to actually do anything. ```python >>> fs.remove_docs({"name":"input.in"}) Traceback (most recent call last): File "", line 1, in File ".../maggma/src/maggma/stores/file_store.py", line 496, in remove_docs raise StoreError( maggma.core.store.StoreError: (StoreError(...), 'Warning! This command is about ' 'to delete 2 items from disk! If this is what you want, reissue ' 'this command with confirm=True.') ``` ## Processing files with a `Builder` Now that you can access your files on disk via a `FileStore`, it's time to write a `Builder` to read and process the data (see [Writing a Builder](simple_builder.md)). Keep in mind that `get_items` will return documents like the one shown in (#creating-the-filestore). You can then use `process_items` to - Create structured data from the `contents` - Open the file for reading using a custom piece of code - etc. Once you can process data on your disk with a `Builder`, you can send that data to any kind of `Store` you like - another `FileStore`, a database, etc. maggma-0.70.0/docs/getting_started/using_ssh_tunnel.md000066400000000000000000000060001470132070100230460ustar00rootroot00000000000000# Using `SSHTunnel` to connect to remote database One of the typical scenarios to use `maggma` is to connect to a remote database that is behind a firewall and thus cannot be accessed directly from your local computer (as shown below, [image credits](https://github.com/pahaz/sshtunnel/)). In this case, you can use `SSHTunnel` to first connect to the remote server, and then connect to the database from the server. 
```
----------------------------------------------------------------------

                             |
-------------+               |   +----------+               +---------
   LOCAL     |               |   |  REMOTE  |               | PRIVATE
   COMPUTER  | <== SSH ========> |  SERVER  | <== local ==> |  SERVER
-------------+               |   +----------+               +---------
                             |
                  FIREWALL (only port 22 is open)

----------------------------------------------------------------------

Note, the `local` indicates that the connection to the PRIVATE SERVER can only be made from the REMOTE SERVER.
```

## Example usage with `S3Store`

Below is an example of how to use `SSHTunnel` to connect to an AWS `S3Store` hosted on a private server. Let's assume that, from your local computer, you can ssh to the remote server using the following command with your credentials (e.g. `<username>` and `<password>`):

```bash
ssh <username>@<remote_server_address>
```

and then from the remote server, you can access your database using, e.g., the following information:

```
private_server_address: COMPUTE_NODE_1
private_server_port: 9000
```

You can create an `SSHTunnel` object as follows:

```python
from maggma.stores.ssh_tunnel import SSHTunnel

tunnel = SSHTunnel(
    tunnel_server_address = "<remote_server_address>:22",
    username = "<username>",
    password = "<password>",
    remote_server_address = "COMPUTE_NODE_1:9000",
    local_port = 9000,
)
```

and then pass it to the `S3Store` to connect to the database.

The arguments of the `SSHTunnel` are self-explanatory, but `local_port` needs more explanation. We assume that on the local computer, we want to connect to the localhost address `http://127.0.0.1`, so we do not need to provide the address, but only the port number (`9000` in this case).

In essence, `SSHTunnel` allows the connection to the database at `COMPUTE_NODE_1:9000` on the private server from the localhost address `http://127.0.0.1:9000` on the local computer, as if the database were hosted on the local computer.

## Other use cases

As an alternative to using `username` and `password` for authentication with the remote server, `SSHTunnel` also supports authentication using SSH keys. In this case, you will need to provide your SSH credentials using the `private_key` argument. Read the docs of the `SSHTunnel` for more information.

`SSHTunnel` can also be used with other stores such as `MongoStore`, `MongoURIStore`, and `GridFSStore`. The usage is similar to the example above, but you might need to adjust the arguments to the `SSHTunnel` to match the use case.
maggma-0.70.0/docs/index.md000066400000000000000000000124171470132070100154100ustar00rootroot00000000000000
# ![Maggma](logo_w_text.svg)

[![Static Badge](https://img.shields.io/badge/documentation-blue?logo=github)](https://materialsproject.github.io/maggma) [![testing](https://github.com/materialsproject/maggma/workflows/testing/badge.svg)](https://github.com/materialsproject/maggma/actions?query=workflow%3Atesting) [![codecov](https://codecov.io/gh/materialsproject/maggma/branch/main/graph/badge.svg)](https://codecov.io/gh/materialsproject/maggma) [![python](https://img.shields.io/badge/Python-3.9+-blue.svg?logo=python&logoColor=white)]()

## What is Maggma

Maggma is a framework to build scientific data processing pipelines from data stored in a variety of formats -- databases, Azure Blobs, files on disk, etc., all the way to a REST API. The rest of this README contains a brief, high-level overview of what `maggma` can do. For more, please refer to [the documentation](https://materialsproject.github.io/maggma).

## Installation

### From PyPI

Maggma is published on the [Python Package Index](https://pypi.org/project/maggma/). The preferred tool for installing packages from *PyPI* is **pip**.
This tool is provided with all modern versions of Python. Open your terminal and run the following command:

``` shell
pip install --upgrade maggma
```

### Direct from `git`

If you want to install the latest development version, but do not plan to make any changes to it, you can install as follows:

``` shell
pip install git+https://github.com/materialsproject/maggma
```

### Local Clone

You can install Maggma directly from a clone of the [Git repository](https://github.com/materialsproject/maggma). This can be done either by cloning the repo and installing from the local clone, or simply installing directly via **git**.

``` shell
git clone https://github.com/materialsproject/maggma
cd maggma
python setup.py install
```

## Basic Concepts

`maggma`'s core classes -- [`Store`](#store) and [`Builder`](#builder) -- provide building blocks for modular data pipelines. Data resides in one or more `Store` and is processed by a `Builder`. The results of the processing are saved in another `Store`, and so on:

```mermaid
flowchart LR
    s1(Store 1) --Builder 1--> s2(Store 2) --Builder 2--> s3(Store 3)
    s2 -- Builder 3-->s4(Store 4)
```

### Store

A major challenge in building scalable data pipelines is dealing with all the different types of data sources out there. Maggma's `Store` class provides a consistent, unified interface for querying data from arbitrary data sources. It was originally built around MongoDB, so its interface closely resembles `PyMongo` syntax. However, Maggma makes it possible to use that same syntax to query other types of databases, such as Amazon S3, GridFS, or files on disk, [and many others](https://materialsproject.github.io/maggma/getting_started/stores/#list-of-stores).

Stores implement methods to `connect`, `query`, find `distinct` values, `groupby` fields, `update` documents, and `remove` documents. The example below demonstrates inserting 4 documents (python `dicts`) into a `MongoStore` with `update`, then accessing the data using `count`, `query`, and `distinct`.

```python
>>> turtles = [{"name": "Leonardo", "color": "blue", "tool": "sword"},
               {"name": "Donatello","color": "purple", "tool": "staff"},
               {"name": "Michelangelo", "color": "orange", "tool": "nunchuks"},
               {"name":"Raphael", "color": "red", "tool": "sai"}
]
>>> store = MongoStore(database="my_db_name",
                       collection_name="my_collection_name",
                       username="my_username",
                       password="my_password",
                       host="my_hostname",
                       port=27017,
                       key="name",
)
>>> with store:
        store.update(turtles)
>>> store.count()
4
>>> store.query_one({})
{'_id': ObjectId('66746d29a78e8431daa3463a'), 'name': 'Leonardo', 'color': 'blue', 'tool': 'sword'}
>>> store.distinct('color')
['purple', 'orange', 'blue', 'red']
```

### Builder

Builders represent a data processing step, analogous to an extract-transform-load (ETL) operation in a data warehouse model. Much like `Store` provides a consistent interface for accessing data, the `Builder` classes provide a consistent interface for transforming it.

`Builder` transformations are each broken into 3 phases: `get_items`, `process_item`, and `update_targets`:

1. `get_items`: Retrieve items from the source Store(s) for processing by the next phase
2. `process_item`: Manipulate the input item and create an output document that is sent to the next phase for storage.
3. `update_targets`: Add the processed item to the target Store(s).

Both `get_items` and `update_targets` can perform IO (input/output) to the data stores. `process_item` is expected to not perform any IO so that it can be parallelized by Maggma.
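To make the three phases concrete, here is a minimal sketch of a `Builder` subclass that works on the turtle documents from the `Store` example above. The class name `ColorFlagBuilder`, the `is_blue` output field, and the use of `MemoryStore` for the usage snippet are hypothetical illustrations; only the `Builder` base class and the three phase methods come from `maggma` itself.

```python
from maggma.core import Builder
from maggma.stores import MemoryStore


class ColorFlagBuilder(Builder):
    """Hypothetical builder that flags which turtle documents are blue."""

    def __init__(self, source, target, **kwargs):
        self.source = source
        self.target = target
        # register the source and target Stores with the base class
        super().__init__(sources=[source], targets=[target], **kwargs)

    def get_items(self):
        # phase 1: pull raw documents from the source Store (performs IO)
        return self.source.query()

    def process_item(self, item):
        # phase 2: pure transformation, no IO, so it can be parallelized
        return {"name": item["name"], "is_blue": item.get("color") == "blue"}

    def update_targets(self, items):
        # phase 3: write the processed documents to the target Store (performs IO)
        self.target.update(items, key="name")


# usage sketch: populate a source store, then run the builder
source = MemoryStore(key="name")
target = MemoryStore(key="name")
source.connect()
source.update([{"name": "Leonardo", "color": "blue", "tool": "sword"}])

ColorFlagBuilder(source, target).run()  # executes get_items -> process_item -> update_targets
```

The base class provides a serial `run()` method that chains the three phases; the `mrun` command-line runner builds on the same interface for production pipelines.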
Builders can be chained together into an array and then saved as a JSON file to be run on a production system. ## Origin and Maintainers Maggma has been developed and is maintained by the [Materials Project](https://materialsproject.org/) team at Lawrence Berkeley National Laboratory and the [Materials Project Software Foundation](https://github.com/materialsproject/foundation). Maggma is written in [Python](http://docs.python-guide.org/en/latest/) and supports Python 3.9+. maggma-0.70.0/docs/logo_w_text.svg000066400000000000000000000143511470132070100170310ustar00rootroot00000000000000 Maggma maggma-0.70.0/docs/quickstart.md000066400000000000000000000070351470132070100164730ustar00rootroot00000000000000# 5-minute `maggma` quickstart ## Install Open your terminal and run the following command. ``` shell pip install --upgrade maggma ``` ## Format your data Structure your data as a `list` of `dict` objects, where each `dict` represents a single record (called a 'document'). Below, we've created some data to represent info about the Teenage Mutant Ninja Turtles. ```python >>> turtles = [{"name": "Leonardo", "color": "blue", "tool": "sword"}, {"name": "Donatello","color": "purple", "tool": "staff"}, {"name": "Michelangelo", "color": "orange", "tool": "nunchuks"}, {"name":"Raphael", "color": "red", "tool": "sai"} ] ``` Structuring your data in this manner enables highly flexible storage options -- you can easily write it to a `.json` file, place it in a `Store`, insert it into a Mongo database, etc. `maggma` is designed to facilitate this. In addition to being structured as a `list` of `dict`, **every document (`dict`) must have a key that uniquely identifies it.** By default, this key is the `task_id`, but it can be set to any value you like using the `key` argument when you instantiate a `Store`. In the example above, `name` can serve as a key because all documents have it, and the values are all unique. See [Using Stores](getting_started/stores.md/#structuring-store-data) for more details on structuring data. ## Create a `Store` `maggma` contains `Store` classes that connect to MongoDB, Azure, S3 buckets, `.json` files, system memory, and many more data sources. Regardless of the underlying storage platform, all `Store` classes implement the same interface for connecting and querying. The simplest store to use is the `MemoryStore`. It simply loads your data into memory and makes it accessible via `Store` methods like `query`, `distinct`, etc. Note that for this particular store, your data is not saved anywhere - once you close it, the data are lost from RAM! Note that in this example, we've set `key='name'` when creating the `Store` because we want to use `name` as our unique identifier. ```python >>> from maggma.stores import MemoryStore >>> store = MemoryStore(key="name") ``` See [Using Stores](getting_started/stores.md/#list-of-stores) for more details on available `Store` classes. ## Connect to the `Store` Before you can interact with a store, you have to `connect()`. This is as simple as ```python store.connect() ``` When you are finished, you can close the connection with `store.close()`. A cleaner (and recommended) way to make sure connections are appropriately closed is to access `Store` through a context manager (a `with` statement), like this: ```python with store as s: s.query() ``` ## Add your data to the `Store` To add data to the store, use `update()`. 
```python with store as s: s.update(turtles) ``` ## Query the `Store` Now that you have added your data to a `Store`, you can leverage `maggma`'s powerful API to query and analyze it. Here are some examples: See how many documents the `Store` contains ```python >>> store.count() 4 ``` Query a single document to see its structure ```python >>> store.query_one({}) {'_id': ObjectId('66746d29a78e8431daa3463a'), 'name': 'Leonardo', 'color': 'blue', 'tool': 'sword'} ``` List all the unique values of the `color` field ```python >>> store.distinct('color') ['purple', 'orange', 'blue', 'red'] ``` See [Understanding Queries](getting_started/query_101.md) for more example queries and [the `Store` interface](getting_started/stores.md/#the-store-interface) for more details about available `Store` methods. maggma-0.70.0/docs/reference/000077500000000000000000000000001470132070100157105ustar00rootroot00000000000000maggma-0.70.0/docs/reference/builders.md000066400000000000000000000001031470132070100200350ustar00rootroot00000000000000::: maggma.builders.map_builder ::: maggma.builders.group_builder maggma-0.70.0/docs/reference/core_builder.md000066400000000000000000000000301470132070100206610ustar00rootroot00000000000000::: maggma.core.builder maggma-0.70.0/docs/reference/core_store.md000066400000000000000000000000261470132070100203740ustar00rootroot00000000000000::: maggma.core.store maggma-0.70.0/docs/reference/core_validator.md000066400000000000000000000000321470132070100212220ustar00rootroot00000000000000::: maggma.core.validator maggma-0.70.0/docs/reference/stores.md000066400000000000000000000011631470132070100175520ustar00rootroot00000000000000!!! note Some `Store` classes require extra packages that are not installed by default. Run the following modified installation commands if you want to use these stores: `MongograntStore`: ```shell pip install maggma[mongogrant] ``` `MontyStore`: ```shell pip install maggma[montydb] ``` `VaultStore`: ```shell pip install maggma[vault] ``` ::: maggma.stores.mongolike ::: maggma.stores.file_store ::: maggma.stores.gridfs ::: maggma.stores.aws ::: maggma.stores.azure ::: maggma.stores.advanced_stores ::: maggma.stores.compound_stores ::: maggma.stores.ssh_tunnel maggma-0.70.0/mkdocs.yml000066400000000000000000000032471470132070100150330ustar00rootroot00000000000000site_name: Maggma Documentation site_description: Documentation for Maggma, a files-to-API data pipeline for scientific applications copyright: Built by The Materials Project theme: name: 'material' nav: - Home: index.md - Core Concepts: concepts.md - Quickstart: quickstart.md - User Guide: - Understanding Queries: getting_started/query_101.md - Using Stores: getting_started/stores.md - Working with FileStore: getting_started/using_file_store.md - Writing a Builder: getting_started/simple_builder.md - Running a Builder Pipeline: getting_started/running_builders.md - Advanced Builders: getting_started/advanced_builder.md - Working with MapBuilder: getting_started/map_builder.md - Working with GroupBuilder: getting_started/group_builder.md - Setting up MongoDB: getting_started/mongodb.md - Using SSHTunnel: getting_started/using_ssh_tunnel.md - Reference: Core: Store: reference/core_store.md Builder: reference/core_builder.md Validator: reference/core_validator.md Stores: reference/stores.md Builders: reference/builders.md - Changelog: CHANGELOG.md site_url: https://materialsproject.github.io/maggma/ repo_url: https://github.com/materialsproject/maggma/ markdown_extensions: - pymdownx.tabbed - admonition - 
codehilite - attr_list - pymdownx.details - pymdownx.superfences: custom_fences: - name: mermaid class: mermaid format: !!python/name:pymdownx.superfences.fence_code_format - pymdownx.inlinehilite - toc: permalink: true plugins: - search - minify - mkdocstrings: handlers: python: paths: [src] maggma-0.70.0/pyproject.toml000066400000000000000000000130721470132070100157410ustar00rootroot00000000000000[project] name = "maggma" readme = "README.md" dynamic = ["version"] description="Framework to develop datapipelines from files on disk to full dissemenation API" authors =[ {name = "The Materials Project", email = "feedback@materialsproject.org"} ] classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", "Intended Audience :: System Administrators", "Intended Audience :: Information Technology", "Operating System :: OS Independent", "Topic :: Other/Nonlisted Topic", "Topic :: Database :: Front-Ends", "Topic :: Scientific/Engineering", ] license = {file = "LICENSE"} requires-python = ">=3.9" dependencies = [ "setuptools", "ruamel.yaml>=0.17", "pydantic>=2.0", "pydantic-settings>=2.0.3", "pymongo>=4.2.0", "monty>=2024.5.24", "mongomock>=3.10.0", "pydash>=4.1.0", "jsonschema>=3.1.1", "tqdm>=4.19.6", "pandas>=2.2", "jsonlines>=4.0.0", "aioitertools>=0.5.1", "numpy>=1.26", "pyzmq>=25.1.1", "dnspython>=1.16.0", "sshtunnel>=0.1.5", "msgpack>=0.5.6", "orjson>=3.9.0", "boto3>=1.20.41", "python-dateutil>=2.8.2", ] [project.urls] Docs = "https://materialsproject.github.io/maggma/" Repo = "https://github.com/materialsproject/maggma" Package = "https://pypi.org/project/maggma" [project.scripts] mrun = "maggma.cli:run" [project.optional-dependencies] vasp = ["pymatgen"] vault = ["hvac>=0.9.5"] memray = ["memray>=1.7.0"] montydb = ["montydb>=2.3.12"] mongogrant = ["mongogrant>=0.3.1"] notebook_runner = ["IPython>=8.11", "nbformat>=5.0", "regex>=2020.6"] azure = ["azure-storage-blob>=12.16.0", "azure-identity>=1.12.0"] api = ["fastapi>=0.42.0","uvicorn>=0.18.3"] testing = [ "pytest", "pytest-cov", "pytest-mock", "pytest-asyncio", "pytest-xdist", "pre-commit", "moto>=5.0", # mock_s3 changed to mock_aws in v5 "ruff", "responses<0.22.0", "types-pyYAML", "types-setuptools", "types-python-dateutil", "starlette[full]" ] docs = [ "mkdocs>=1.4.0", "mkdocs-material>=8.3.9", "mkdocs-minify-plugin>=0.5.0", "mkdocstrings[python]>=0.18.1", "jinja2<3.2.0", ] [build-system] requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=5"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["src"] [tool.setuptools_scm] version_scheme = "no-guess-dev" [tool.black] line-length = 120 [tool.ruff] line-length = 120 src = ["src"] [tool.ruff.lint] select = [ "B", # flake8-bugbear "C4", # flake8-comprehensions "D", # pydocstyle "E", # pycodestyle error "EXE", # flake8-executable "F", # pyflakes "FA", # flake8-future-annotations "FLY", # flynt "I", # isort "ICN", # flake8-import-conventions "ISC", # flake8-implicit-str-concat "PD", # pandas-vet "PERF", # perflint "PIE", # flake8-pie "PL", # pylint "PT", # flake8-pytest-style "PYI", # flakes8-pyi "Q", # flake8-quotes "RET", # flake8-return "RSE", # flake8-raise "RUF", # Ruff-specific rules "SIM", # flake8-simplify "SLOT", # flake8-slots "TCH", # flake8-type-checking "TID", # tidy imports "TID", # flake8-tidy-imports 
"UP", # pyupgrade "W", # pycodestyle warning "YTT", # flake8-2020 "NPY201", # numpy 2.0 ] ignore = [ "B023", # Function definition does not bind loop variable "B028", # No explicit stacklevel keyword argument found "B904", # Within an except clause, raise exceptions with ... "C408", # unnecessary-collection-call "D105", # Missing docstring in magic method "D205", # 1 blank line required between summary line and description "D212", # Multi-line docstring summary should start at the first line "FA100", # Missing `from __future__ import annotations`, but uses `typing.XXX` TODO "PD011", # pandas-use-of-dot-values "PD901", # pandas-df-variable-name "PT011", # `pytest.raises(XXXError)` is too broad, set the `match` parameter... TODO "PERF203", # try-except-in-loop "PERF401", # manual-list-comprehension (TODO fix these or wait for autofix) "PLR", # pylint refactor "PLW2901", # Outer for loop variable overwritten by inner assignment target "PT013", # pytest-incorrect-pytest-import "RUF012", # Disable checks for mutable class args. This is a non-problem. "SIM105", # Use contextlib.suppress(OSError) instead of try-except-pass ] pydocstyle.convention = "google" isort.split-on-trailing-comma = false [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401"] "tasks.py" = ["D"] "tests/*" = ["D"] "src/maggma/api/*" = ["B008", "B021", "RET505", "RET506"] "tests/api/*" = ["B017", "B018"] "src/maggma/cli/*" = ["EXE001"] # triggered by ! at top of file [tool.pytest.ini_options] minversion = "6.0" addopts = "--color=yes -p no:warnings --import-mode=importlib --durations=30" testpaths = [ "tests", ] [tool.mypy] ignore_missing_imports = true namespace_packages = true explicit_package_bases = true no_implicit_optional = false [tool.codespell] ignore-words-list = "ot,nin" skip = 'docs/CHANGELOG.md,tests/test_files/*' maggma-0.70.0/requirements/000077500000000000000000000000001470132070100155455ustar00rootroot00000000000000maggma-0.70.0/requirements/macos-latest_py3.10.txt000066400000000000000000000050221470132070100217130ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --output-file=requirements/macos-latest_py3.10.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.1.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # 
pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==2.2.3 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.10_extras.txt000066400000000000000000000242641470132070100233120ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --all-extras --output-file=requirements/macos-latest_py3.10_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo exceptiongroup==1.2.2 # via # anyio # ipython # pytest execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests iniconfig==2.0.0 # via pytest ipython==8.28.0 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema 
jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.3 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.1.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.10.3 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # 
matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.14.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tomli==2.0.2 # via # coverage # pytest tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # anyio # azure-core # azure-identity # azure-storage-blob # fastapi # ipython # pydantic # pydantic-core # pydash # rich # textual # uvicorn tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==2.2.3 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.11.txt000066400000000000000000000050221470132070100217140ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=requirements/macos-latest_py3.11.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.1.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock 
pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==2.2.3 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.11_extras.txt000066400000000000000000000240051470132070100233040ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --all-extras --output-file=requirements/macos-latest_py3.11_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests iniconfig==2.0.0 # via pytest ipython==8.28.0 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython 
jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.3 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.1.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.10.3 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via 
matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.14.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # azure-core # azure-identity # azure-storage-blob # fastapi # ipython # pydantic # pydantic-core # pydash # textual tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==2.2.3 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.12.txt000066400000000000000000000050221470132070100217150ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile --output-file=requirements/macos-latest_py3.12.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma 
(pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.1.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==2.2.3 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.12_extras.txt000066400000000000000000000237651470132070100233210ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile --all-extras --output-file=requirements/macos-latest_py3.12_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via 
httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests iniconfig==2.0.0 # via pytest ipython==8.28.0 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.3 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.1.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma 
(pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.10.3 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.14.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # azure-core # azure-identity # azure-storage-blob # fastapi # pydantic # pydantic-core # pydash # textual tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==2.2.3 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.9.txt000066400000000000000000000050471470132070100216520ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile --output-file=requirements/macos-latest_py3.9.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via 
maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.0.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # aioitertools # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==1.26.20 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/macos-latest_py3.9_extras.txt000066400000000000000000000250241470132070100232350ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile --all-extras --output-file=requirements/macos-latest_py3.9_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo exceptiongroup==1.2.2 # via # anyio # ipython # pytest execnet==2.1.1 # via pytest-xdist 
executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests importlib-metadata==8.5.0 # via # flask # markdown # mkdocs # mkdocs-get-deps # mkdocstrings importlib-resources==6.4.5 # via # matplotlib # spglib iniconfig==2.0.0 # via pytest ipython==8.18.1 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.2.1 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.0.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # 
mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.8.9 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.13.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tomli==2.0.2 # via # coverage # pytest tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # aioitertools # anyio # azure-core # azure-identity # azure-storage-blob # fastapi # ipython # mkdocstrings # pydantic # pydantic-core # pydash # rich # spglib # starlette # textual # uvicorn tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==1.26.20 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs 
wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto zipp==3.20.2 # via # importlib-metadata # importlib-resources # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.10.txt000066400000000000000000000050231470132070100221340ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --output-file=requirements/ubuntu-latest_py3.10.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.1.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==2.2.3 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.10_extras.txt000066400000000000000000000242651470132070100235330ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --all-extras --output-file=requirements/ubuntu-latest_py3.10_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 
# via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo exceptiongroup==1.2.2 # via # anyio # ipython # pytest execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests iniconfig==2.0.0 # via pytest ipython==8.28.0 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.3 # via pymatgen nodeenv==1.9.1 # 
via pre-commit numpy==2.1.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.10.3 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.14.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tomli==2.0.2 # via # coverage # pytest tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma 
(pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # anyio # azure-core # azure-identity # azure-storage-blob # fastapi # ipython # pydantic # pydantic-core # pydash # rich # textual # uvicorn tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==2.2.3 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.11.txt000066400000000000000000000050231470132070100221350ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=requirements/ubuntu-latest_py3.11.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.1.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==2.2.3 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.11_extras.txt000066400000000000000000000240061470132070100235250ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --all-extras --output-file=requirements/ubuntu-latest_py3.11_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # 
via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests iniconfig==2.0.0 # via pytest ipython==8.28.0 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) 
monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.3 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.1.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.10.3 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.14.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) 
sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # azure-core # azure-identity # azure-storage-blob # fastapi # ipython # pydantic # pydantic-core # pydash # textual tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==2.2.3 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.12.txt000066400000000000000000000050231470132070100221360ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile --output-file=requirements/ubuntu-latest_py3.12.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.1.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==2.2.3 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools 
maggma-0.70.0/requirements/ubuntu-latest_py3.12_extras.txt000066400000000000000000000237661470132070100235420ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile --all-extras --output-file=requirements/ubuntu-latest_py3.12_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests iniconfig==2.0.0 # via pytest ipython==8.28.0 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # 
mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.3 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.1.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.10.3 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions # pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen 
ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.14.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # azure-core # azure-identity # azure-storage-blob # fastapi # pydantic # pydantic-core # pydash # textual tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==2.2.3 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.9.txt000066400000000000000000000050501470132070100220640ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile --output-file=requirements/ubuntu-latest_py3.9.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic attrs==24.2.0 # via # jsonlines # jsonschema # referencing bcrypt==4.2.0 # via paramiko boto3==1.35.34 # via maggma (pyproject.toml) botocore==1.35.34 # via # boto3 # s3transfer cffi==1.17.1 # via # cryptography # pynacl cryptography==43.0.1 # via paramiko dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo jmespath==1.0.1 # via # boto3 # botocore jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via maggma (pyproject.toml) jsonschema-specifications==2023.12.1 # via jsonschema mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via maggma (pyproject.toml) msgpack==1.1.0 # via maggma (pyproject.toml) numpy==2.0.2 # via # maggma (pyproject.toml) # pandas orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via mongomock pandas==2.2.3 # via maggma (pyproject.toml) paramiko==3.5.0 # via sshtunnel pycparser==2.22 # via cffi pydantic==2.9.2 # via # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pymongo==4.10.1 # via maggma (pyproject.toml) pynacl==1.5.0 # via paramiko python-dateutil==2.9.0.post0 # via # botocore # maggma (pyproject.toml) # pandas python-dotenv==1.0.1 # via pydantic-settings pytz==2024.2 # via # mongomock # pandas pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via maggma (pyproject.toml) ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.10.2 # via boto3 sentinels==1.0.0 # via mongomock six==1.16.0 # 
via python-dateutil sshtunnel==0.4.0 # via maggma (pyproject.toml) tqdm==4.66.5 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # aioitertools # pydantic # pydantic-core # pydash tzdata==2024.2 # via pandas urllib3==1.26.20 # via botocore # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/requirements/ubuntu-latest_py3.9_extras.txt000066400000000000000000000250251470132070100234560ustar00rootroot00000000000000# # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile --all-extras --output-file=requirements/ubuntu-latest_py3.9_extras.txt # aioitertools==0.12.0 # via maggma (pyproject.toml) annotated-types==0.7.0 # via pydantic anyio==4.6.0 # via # httpx # starlette asttokens==2.4.1 # via stack-data attrs==24.2.0 # via # jsonlines # jsonschema # referencing azure-core==1.31.0 # via # azure-identity # azure-storage-blob azure-identity==1.18.0 # via maggma (pyproject.toml) azure-storage-blob==12.23.1 # via maggma (pyproject.toml) babel==2.16.0 # via mkdocs-material bcrypt==4.2.0 # via paramiko blinker==1.8.2 # via flask boto3==1.35.34 # via # maggma (pyproject.toml) # moto botocore==1.35.34 # via # boto3 # moto # s3transfer certifi==2024.8.30 # via # httpcore # httpx # requests cffi==1.17.1 # via # cryptography # pynacl cfgv==3.4.0 # via pre-commit charset-normalizer==3.3.2 # via requests click==8.1.7 # via # flask # mkdocs # mkdocstrings # mongogrant # uvicorn colorama==0.4.6 # via # griffe # mkdocs-material contourpy==1.3.0 # via matplotlib coverage[toml]==7.6.1 # via pytest-cov cryptography==43.0.1 # via # azure-identity # azure-storage-blob # moto # msal # paramiko # pyjwt csscompressor==0.9.5 # via mkdocs-minify-plugin cycler==0.12.1 # via matplotlib decorator==5.1.1 # via ipython distlib==0.3.8 # via virtualenv dnspython==2.7.0 # via # maggma (pyproject.toml) # pymongo exceptiongroup==1.2.2 # via # anyio # ipython # pytest execnet==2.1.1 # via pytest-xdist executing==2.1.0 # via stack-data fastapi==0.115.0 # via maggma (pyproject.toml) fastjsonschema==2.20.0 # via nbformat filelock==3.16.1 # via virtualenv flask==3.0.3 # via mongogrant fonttools==4.54.1 # via matplotlib ghp-import==2.1.0 # via mkdocs griffe==1.3.2 # via mkdocstrings-python h11==0.14.0 # via # httpcore # uvicorn htmlmin2==0.1.13 # via mkdocs-minify-plugin httpcore==1.0.6 # via httpx httpx==0.27.2 # via starlette hvac==2.3.0 # via maggma (pyproject.toml) identify==2.6.1 # via pre-commit idna==3.10 # via # anyio # httpx # requests importlib-metadata==8.5.0 # via # flask # markdown # mkdocs # mkdocs-get-deps # mkdocstrings importlib-resources==6.4.5 # via # matplotlib # spglib iniconfig==2.0.0 # via pytest ipython==8.18.1 # via maggma (pyproject.toml) isodate==0.6.1 # via azure-storage-blob itsdangerous==2.2.0 # via # flask # starlette jedi==0.19.1 # via ipython jinja2==3.1.4 # via # flask # maggma (pyproject.toml) # memray # mkdocs # mkdocs-material # mkdocstrings # moto # starlette jmespath==1.0.1 # via # boto3 # botocore joblib==1.4.2 # via pymatgen jsmin==3.0.1 # via mkdocs-minify-plugin jsonlines==4.0.0 # via maggma (pyproject.toml) jsonschema==4.23.0 # via # maggma (pyproject.toml) # nbformat jsonschema-specifications==2023.12.1 # via jsonschema jupyter-core==5.7.2 # via nbformat kiwisolver==1.4.7 # via matplotlib latexcodec==3.0.0 # via pybtex linkify-it-py==2.0.3 # via markdown-it-py markdown==3.7 # via # mkdocs # mkdocs-autorefs # mkdocs-material # mkdocstrings # pymdown-extensions 
markdown-it-py[linkify,plugins]==3.0.0 # via # mdit-py-plugins # rich # textual markupsafe==2.1.5 # via # jinja2 # mkdocs # mkdocs-autorefs # mkdocstrings # werkzeug matplotlib==3.9.2 # via pymatgen matplotlib-inline==0.1.7 # via ipython mdit-py-plugins==0.4.2 # via markdown-it-py mdurl==0.1.2 # via markdown-it-py memray==1.14.0 # via maggma (pyproject.toml) mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps mkdocs==1.6.1 # via # maggma (pyproject.toml) # mkdocs-autorefs # mkdocs-material # mkdocs-minify-plugin # mkdocstrings mkdocs-autorefs==1.2.0 # via # mkdocstrings # mkdocstrings-python mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-material==9.5.39 # via maggma (pyproject.toml) mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-minify-plugin==0.8.0 # via maggma (pyproject.toml) mkdocstrings[python]==0.26.1 # via # maggma (pyproject.toml) # mkdocstrings-python mkdocstrings-python==1.11.1 # via mkdocstrings mongogrant==0.3.3 # via maggma (pyproject.toml) mongomock==4.2.0.post1 # via maggma (pyproject.toml) monty==2024.7.30 # via # maggma (pyproject.toml) # pymatgen montydb==2.5.3 # via maggma (pyproject.toml) moto==5.0.16 # via maggma (pyproject.toml) mpmath==1.3.0 # via sympy msal==1.31.0 # via # azure-identity # msal-extensions msal-extensions==1.2.0 # via azure-identity msgpack==1.1.0 # via maggma (pyproject.toml) nbformat==5.10.4 # via maggma (pyproject.toml) networkx==3.2.1 # via pymatgen nodeenv==1.9.1 # via pre-commit numpy==2.0.2 # via # contourpy # maggma (pyproject.toml) # matplotlib # pandas # pymatgen # scipy # spglib orjson==3.10.7 # via maggma (pyproject.toml) packaging==24.1 # via # matplotlib # mkdocs # mongomock # plotly # pytest paginate==0.5.7 # via mkdocs-material palettable==3.3.3 # via pymatgen pandas==2.2.3 # via # maggma (pyproject.toml) # pymatgen paramiko==3.5.0 # via sshtunnel parso==0.8.4 # via jedi pathspec==0.12.1 # via mkdocs pexpect==4.9.0 # via ipython pillow==10.4.0 # via matplotlib platformdirs==4.3.6 # via # jupyter-core # mkdocs-get-deps # mkdocstrings # textual # virtualenv plotly==5.24.1 # via pymatgen pluggy==1.5.0 # via pytest portalocker==2.10.1 # via msal-extensions pre-commit==4.0.0 # via maggma (pyproject.toml) prompt-toolkit==3.0.48 # via ipython ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybtex==0.24.0 # via pymatgen pycparser==2.22 # via cffi pydantic==2.9.2 # via # fastapi # maggma (pyproject.toml) # pydantic-settings pydantic-core==2.23.4 # via pydantic pydantic-settings==2.5.2 # via maggma (pyproject.toml) pydash==8.0.3 # via maggma (pyproject.toml) pygments==2.18.0 # via # ipython # mkdocs-material # rich pyjwt[crypto]==2.9.0 # via # msal # pyjwt pymatgen==2024.8.9 # via maggma (pyproject.toml) pymdown-extensions==10.11.2 # via # mkdocs-material # mkdocstrings pymongo==4.10.1 # via # maggma (pyproject.toml) # mongogrant pynacl==1.5.0 # via paramiko pyparsing==3.1.4 # via matplotlib pytest==8.3.3 # via # maggma (pyproject.toml) # pytest-asyncio # pytest-cov # pytest-mock # pytest-xdist pytest-asyncio==0.24.0 # via maggma (pyproject.toml) pytest-cov==5.0.0 # via maggma (pyproject.toml) pytest-mock==3.14.0 # via maggma (pyproject.toml) pytest-xdist==3.6.1 # via maggma (pyproject.toml) python-dateutil==2.9.0.post0 # via # botocore # ghp-import # maggma (pyproject.toml) # matplotlib # moto # pandas python-dotenv==1.0.1 # via pydantic-settings python-multipart==0.0.12 # via starlette pytz==2024.2 # via # mongomock # pandas pyyaml==6.0.2 # via # mkdocs # mkdocs-get-deps # pre-commit # pybtex # pymdown-extensions 
# pyyaml-env-tag # starlette pyyaml-env-tag==0.1 # via mkdocs pyzmq==26.2.0 # via maggma (pyproject.toml) referencing==0.35.1 # via # jsonschema # jsonschema-specifications regex==2024.9.11 # via # maggma (pyproject.toml) # mkdocs-material requests==2.32.3 # via # azure-core # hvac # mkdocs-material # mongogrant # moto # msal # pymatgen # responses responses==0.21.0 # via # maggma (pyproject.toml) # moto rich==13.9.2 # via # memray # textual rpds-py==0.20.0 # via # jsonschema # referencing ruamel-yaml==0.18.6 # via # maggma (pyproject.toml) # pymatgen ruamel-yaml-clib==0.2.8 # via ruamel-yaml ruff==0.6.9 # via maggma (pyproject.toml) s3transfer==0.10.2 # via boto3 scipy==1.13.1 # via pymatgen sentinels==1.0.0 # via mongomock six==1.16.0 # via # asttokens # azure-core # isodate # pybtex # python-dateutil sniffio==1.3.1 # via # anyio # httpx spglib==2.5.0 # via pymatgen sshtunnel==0.4.0 # via maggma (pyproject.toml) stack-data==0.6.3 # via ipython starlette[full]==0.38.6 # via # fastapi # maggma (pyproject.toml) sympy==1.13.3 # via pymatgen tabulate==0.9.0 # via pymatgen tenacity==9.0.0 # via plotly textual==0.82.0 # via memray tomli==2.0.2 # via # coverage # pytest tqdm==4.66.5 # via # maggma (pyproject.toml) # pymatgen traitlets==5.14.3 # via # ipython # jupyter-core # matplotlib-inline # nbformat types-python-dateutil==2.9.0.20241003 # via maggma (pyproject.toml) types-pyyaml==6.0.12.20240917 # via maggma (pyproject.toml) types-setuptools==75.1.0.20240917 # via maggma (pyproject.toml) typing-extensions==4.12.2 # via # aioitertools # anyio # azure-core # azure-identity # azure-storage-blob # fastapi # ipython # mkdocstrings # pydantic # pydantic-core # pydash # rich # spglib # starlette # textual # uvicorn tzdata==2024.2 # via pandas uc-micro-py==1.0.3 # via linkify-it-py uncertainties==3.2.2 # via pymatgen urllib3==1.26.20 # via # botocore # requests # responses uvicorn==0.31.0 # via maggma (pyproject.toml) virtualenv==20.26.6 # via pre-commit watchdog==5.0.3 # via mkdocs wcwidth==0.2.13 # via prompt-toolkit werkzeug==3.0.4 # via # flask # moto xmltodict==0.13.0 # via moto zipp==3.20.2 # via # importlib-metadata # importlib-resources # The following packages are considered to be unsafe in a requirements file: # setuptools maggma-0.70.0/src/000077500000000000000000000000001470132070100136115ustar00rootroot00000000000000maggma-0.70.0/src/maggma/000077500000000000000000000000001470132070100150425ustar00rootroot00000000000000maggma-0.70.0/src/maggma/__init__.py000066400000000000000000000003361470132070100171550ustar00rootroot00000000000000"""Primary Maggma module.""" from importlib.metadata import PackageNotFoundError, version try: __version__ = version("maggma") except PackageNotFoundError: # pragma: no cover # package is not installed pass maggma-0.70.0/src/maggma/api/000077500000000000000000000000001470132070100156135ustar00rootroot00000000000000maggma-0.70.0/src/maggma/api/API.py000066400000000000000000000076701470132070100166100ustar00rootroot00000000000000from datetime import datetime from typing import Optional import uvicorn from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.gzip import GZipMiddleware from monty.json import MSONable from starlette.responses import RedirectResponse from maggma.api.resource import Resource class API(MSONable): """ Basic API manager to tie together various resources. 
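    Example:
        A minimal, illustrative sketch of wiring one read-only endpoint into an
        API and serving it locally. ExampleDoc is a hypothetical pydantic model,
        the store path is a placeholder, and the resource setup is an assumed
        pattern rather than the only supported one:

            from maggma.stores import JSONStore
            from maggma.api.resource import ReadOnlyResource

            store = JSONStore("./tasks.json")  # any maggma Store could back the resource
            resource = ReadOnlyResource(store, ExampleDoc)

            api = API(
                resources={"tasks": [resource]},  # mounted under the /tasks prefix
                title="Tasks API",
                version="v0.1.0",
            )
            api.run(ip="127.0.0.1", port=8000)  # serves the FastAPI app via uvicorn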
""" def __init__( self, resources: dict[str, list[Resource]], title: str = "Generic API", version: str = "v0.0.0", debug: bool = False, heartbeat_meta: Optional[dict] = None, description: Optional[str] = None, tags_meta: Optional[list[dict]] = None, ): """ Args: resources: dictionary of resource objects and http prefix they live in title: a string title for this API version: the version for this API debug: turns debug on in FastAPI heartbeat_meta: dictionary of additional metadata to include in the heartbeat response description: description of the API to be used in the generated docs tags_meta: descriptions of tags to be used in the generated docs. """ self.title = title self.version = version self.debug = debug self.heartbeat_meta = heartbeat_meta self.description = description self.tags_meta = tags_meta if len(resources) == 0: raise RuntimeError("ERROR: There are no endpoints provided") self.resources = resources def on_startup(self): """ Basic startup that runs the resource startup functions. """ for resource_list in self.resources.values(): for resource in resource_list: resource.on_startup() @property def app(self): """ App server for the cluster manager. """ app = FastAPI( title=self.title, version=self.version, on_startup=[self.on_startup], debug=self.debug, description=self.description, openapi_tags=self.tags_meta, ) # Allow requests from other domains in debug mode. This allows # testing with local deployments of other services. For production # deployment, this will be taken care of by nginx. if self.debug: app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"], ) for prefix, resource_list in self.resources.items(): main_resource = resource_list.pop(0) for resource in resource_list: main_resource.router.include_router(resource.router) app.include_router(main_resource.router, prefix=f"/{prefix}") app.add_middleware(GZipMiddleware, minimum_size=1000) @app.get("/heartbeat", include_in_schema=False) @app.head("/heartbeat", include_in_schema=False) def heartbeat(): """API Heartbeat for Load Balancing.""" return { "status": "OK", "time": datetime.utcnow(), "version": self.version, **self.heartbeat_meta, } @app.get("/", include_in_schema=False) def redirect_docs(): """Redirects the root end point to the docs.""" return RedirectResponse(url=app.docs_url, status_code=301) return app def run(self, ip: str = "127.0.0.1", port: int = 8000, log_level: str = "info"): """ Runs the Cluster Manager locally. 
Args: ip: Local IP to listen on port: Local port to listen on log_level: Logging level for the webserver Returns: None """ uvicorn.run(self.app, host=ip, port=port, log_level=log_level, reload=False) maggma-0.70.0/src/maggma/api/__init__.py000066400000000000000000000000471470132070100177250ustar00rootroot00000000000000"""Simple API Interface for Maggma.""" maggma-0.70.0/src/maggma/api/default_responses.yaml000066400000000000000000000030321470132070100222220ustar00rootroot00000000000000# CITATION: https://gist.github.com/bl4de/3086cf26081110383631 # Table mapping response codes to messages; entries have the 100: description: Continue 101: description: Switching Protocols 200: description: OK 201: description: Created 202: description: Accepted 203: description: Non-Authoritative Information 204: description: No Content 205: description: Reset Content 206: description: Partial Content 300: description: Multiple Choices 301: description: Moved Permanently 302: description: Found 303: description: See Other 304: description: Not Modified 305: description: Use Proxy 307: description: Temporary Redirect 400: description: Bad Request 401: description: Unauthorized 402: description: Payment Required 403: description: Forbidden 404: description: Not Found 405: description: Method Not Allowed 406: description: Not Acceptable 407: description: Proxy Authentication Required 408: description: Request Timeout 409: description: Conflict 410: description: Gone 411: description: Length Required 412: description: Precondition Failed 413: description: Request Entity Too Large 414: description: Request-URI Too Long 415: description: Unsupported Media Type 416: description: Requested Range Not Satisfiable 417: description: Expectation Failed 500: description: Internal Server Error 501: description: Not Implemented 502: description: Bad Gateway 503: description: Service Unavailable 504: description: Gateway Timeout 505: description: HTTP Version Not Supported maggma-0.70.0/src/maggma/api/models.py000066400000000000000000000047421470132070100174570ustar00rootroot00000000000000from datetime import datetime from typing import Generic, Optional, TypeVar from pydantic import BaseModel, Field, validator from maggma import __version__ """ Describes the Materials API Response """ DataT = TypeVar("DataT") class Meta(BaseModel): """ Meta information for the MAPI Response. """ api_version: str = Field( __version__, description="a string containing the version of the Materials API implementation, e.g. v0.9.5", ) time_stamp: datetime = Field( description="a string containing the date and time at which the query was executed", default_factory=datetime.utcnow, ) total_doc: Optional[int] = Field(None, description="the total number of documents available for this query", ge=0) class Config: extra = "allow" class Error(BaseModel): """ Base Error model for General API. """ code: int = Field(..., description="The error code") message: str = Field(..., description="The description of the error") @classmethod def from_traceback(cls, traceback): pass class Response(BaseModel, Generic[DataT]): """ A Generic API Response. 
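    Example:
        A small self-contained illustration of the generic wrapper; ExampleDoc is
        a throwaway model used only for this sketch:

            class ExampleDoc(BaseModel):
                task_id: str

            resp = Response[ExampleDoc](data=[ExampleDoc(task_id="mp-1")])
            print(resp.meta.total_doc)  # -> 1, filled in by the default_meta validator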
""" data: Optional[list[DataT]] = Field(None, description="List of returned data") errors: Optional[list[Error]] = Field(None, description="Any errors on processing this query") meta: Optional[Meta] = Field(None, description="Extra information for the query") @validator("errors", always=True) def check_consistency(cls, v, values): if v is not None and values["data"] is not None: raise ValueError("must not provide both data and error") if v is None and values.get("data") is None: raise ValueError("must provide data or error") return v @validator("meta", pre=True, always=True) def default_meta(cls, v, values): if v is None: v = Meta().dict() if v.get("total_doc", None) is None: if values.get("data", None) is not None: v["total_doc"] = len(values["data"]) else: v["total_doc"] = 0 return v class S3URLDoc(BaseModel): """ S3 pre-signed URL data returned by the S3 URL resource. """ url: str = Field( ..., description="Pre-signed download URL", ) requested_datetime: datetime = Field(..., description="Datetime for when URL was requested") expiry_datetime: datetime = Field(..., description="Expiry datetime of the URL") maggma-0.70.0/src/maggma/api/query_operator/000077500000000000000000000000001470132070100206735ustar00rootroot00000000000000maggma-0.70.0/src/maggma/api/query_operator/__init__.py000066400000000000000000000010641470132070100230050ustar00rootroot00000000000000from maggma.api.query_operator.core import QueryOperator from maggma.api.query_operator.dynamic import NumericQuery, StringQueryOperator from maggma.api.query_operator.pagination import PaginationQuery from maggma.api.query_operator.sorting import SortQuery from maggma.api.query_operator.sparse_fields import SparseFieldsQuery from maggma.api.query_operator.submission import SubmissionQuery __all__ = [ "QueryOperator", "NumericQuery", "StringQueryOperator", "PaginationQuery", "SortQuery", "SparseFieldsQuery", "SubmissionQuery", ] maggma-0.70.0/src/maggma/api/query_operator/core.py000066400000000000000000000017541470132070100222040ustar00rootroot00000000000000from abc import ABCMeta, abstractmethod from monty.json import MSONable from maggma.api.utils import STORE_PARAMS class QueryOperator(MSONable, metaclass=ABCMeta): """ Base Query Operator class for defining powerful query language in the Materials API. """ @abstractmethod def query(self) -> STORE_PARAMS: """ The query function that does the work for this query operator. """ def meta(self) -> dict: """ Returns meta data to return with the Response. Args: store: the Maggma Store that the resource uses query: the query being executed in this API call """ return {} def post_process(self, docs: list[dict], query: dict) -> list[dict]: """ An optional post-processing function for the data. 
Args: docs: the document results to post-process query: the store query dict to use in post-processing """ return docs maggma-0.70.0/src/maggma/api/query_operator/dynamic.py000066400000000000000000000210661470132070100226760ustar00rootroot00000000000000import inspect from abc import abstractmethod from typing import Any, Callable, Optional, Union from fastapi.params import Query from monty.json import MontyDecoder from pydantic import BaseModel from pydantic.fields import FieldInfo from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS from maggma.utils import dynamic_import class DynamicQueryOperator(QueryOperator): """Abstract Base class for dynamic query operators.""" def __init__( self, model: type[BaseModel], fields: Optional[list[str]] = None, excluded_fields: Optional[list[str]] = None, ): self.model = model self.fields = fields self.excluded_fields = excluded_fields all_fields: dict[str, FieldInfo] = model.model_fields param_fields = fields or list(set(all_fields.keys()) - set(excluded_fields or [])) # Convert the fields into operator tuples ops = [ op for name, field in all_fields.items() if name in param_fields for op in self.field_to_operator(name, field) ] # Dictionary to make converting the API query names to function that generates # Maggma criteria dictionaries self.mapping = {op[0]: op[3] for op in ops} def query(**kwargs) -> STORE_PARAMS: criteria = [] for k, v in kwargs.items(): if v is not None: try: criteria.append(self.mapping[k](v)) except KeyError: raise KeyError(f"Cannot find key {k} in current query to database mapping") final_crit = {} for entry in criteria: for key, value in entry.items(): if key not in final_crit: final_crit[key] = value else: final_crit[key].update(value) return {"criteria": final_crit} # building the signatures for FastAPI Swagger UI signatures: list = [ inspect.Parameter( op[0], inspect.Parameter.POSITIONAL_OR_KEYWORD, default=op[2], annotation=op[1], ) for op in ops ] query.__signature__ = inspect.Signature(signatures) self.query = query # type: ignore def query(self): """Stub query function for abstract class.""" @abstractmethod def field_to_operator(self, name: str, field: FieldInfo) -> list[tuple[str, Any, Query, Callable[..., dict]]]: """ Converts a PyDantic FieldInfo into a Tuple with the - query param name, - query param type - FastAPI Query object, - and callable to convert the value into a query dict. """ @classmethod def from_dict(cls, d): if isinstance(d["model"], str): d["model"] = dynamic_import(d["model"]) decoder = MontyDecoder() return cls(**{k: decoder.process_decoded(v) for k, v in d.items()}) def as_dict(self) -> dict: """ Special as_dict implemented to convert pydantic models into strings. """ d = super().as_dict() # Ensures sub-classes serialize correctly d["model"] = f"{self.model.__module__}.{self.model.__name__}" # type: ignore return d class NumericQuery(DynamicQueryOperator): """Query Operator to enable searching on numeric fields.""" def field_to_operator(self, name: str, field: FieldInfo) -> list[tuple[str, Any, Query, Callable[..., dict]]]: """ Converts a PyDantic FieldInfo into a Tuple with the query_param name, default value, Query object, and callable to convert it into a query dict. 
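# --- Editor-added illustrative sketch (not part of the original source) ---
# A minimal custom ``QueryOperator`` following the abstract base above.  This
# operator and its "formula" field are hypothetical; the only contract assumed
# is the one shown in the base class -- ``query`` returns a STORE_PARAMS dict
# whose "criteria" entry is merged into the final MongoDB query.
from typing import Optional

from fastapi import Query

from maggma.api.query_operator import QueryOperator
from maggma.api.utils import STORE_PARAMS


class FormulaQuery(QueryOperator):
    """Query documents by an exact chemical formula (hypothetical field)."""

    def query(
        self,
        formula: Optional[str] = Query(None, description="Exact formula to match, e.g. Fe2O3"),
    ) -> STORE_PARAMS:
        crit = {"formula": formula} if formula else {}
        return {"criteria": crit}
# --- end editor-added sketch ---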
""" ops = [] field_type = field.annotation if field_type in [int, float, Union[float, None], Union[int, None]]: title: str = name or field.alias ops = [ ( f"{title}_max", field_type, Query( default=None, description=f"Query for maximum value of {title}", ), lambda val: {f"{title}": {"$lte": val}}, ), ( f"{title}_min", field_type, Query( default=None, description=f"Query for minimum value of {title}", ), lambda val: {f"{title}": {"$gte": val}}, ), ] if field_type in [int, Union[int, None]]: ops.extend( [ ( f"{title}", field_type, Query( default=None, description=f"Query for {title} being equal to an exact value", ), lambda val: {f"{title}": val}, ), ( f"{title}_not_eq", field_type, Query( default=None, description=f"Query for {title} being not equal to an exact value", ), lambda val: {f"{title}": {"$ne": val}}, ), ( f"{title}_eq_any", str, # type: ignore Query( default=None, description=f"Query for {title} being any of these values. Provide a comma separated list.", ), lambda val: {f"{title}": {"$in": [int(entry.strip()) for entry in val.split(",")]}}, ), ( f"{title}_neq_any", str, # type: ignore Query( default=None, description=f"Query for {title} being not any of these values. \ Provide a comma separated list.", ), lambda val: {f"{title}": {"$nin": [int(entry.strip()) for entry in val.split(",")]}}, ), ] ) return ops class StringQueryOperator(DynamicQueryOperator): """Query Operator to enable searching on numeric fields.""" def field_to_operator(self, name: str, field: FieldInfo) -> list[tuple[str, Any, Query, Callable[..., dict]]]: """ Converts a PyDantic FieldInfo into a Tuple with the query_param name, default value, Query object, and callable to convert it into a query dict. """ ops = [] field_type: type = field.annotation if field_type in [str, Union[str, None]]: title: str = name ops = [ ( f"{title}", field_type, Query( default=None, description=f"Query for {title} being equal to a value", ), lambda val: {f"{title}": val}, ), ( f"{title}_not_eq", field_type, Query( default=None, description=f"Query for {title} being not equal to a value", ), lambda val: {f"{title}": {"$ne": val}}, ), ( f"{title}_eq_any", str, # type: ignore Query( default=None, description=f"Query for {title} being any of these values. Provide a comma separated list.", ), lambda val: {f"{title}": {"$in": [entry.strip() for entry in val.split(",")]}}, ), ( f"{title}_neq_any", str, # type: ignore Query( default=None, description=f"Query for {title} being not any of these values. Provide a comma separated list", ), lambda val: {f"{title}": {"$nin": [entry.strip() for entry in val.split(",")]}}, ), ] return ops maggma-0.70.0/src/maggma/api/query_operator/pagination.py000066400000000000000000000056761470132070100234140ustar00rootroot00000000000000from fastapi import HTTPException, Query from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS class PaginationQuery(QueryOperator): """Query operators to provides Pagination.""" def __init__(self, default_limit: int = 100, max_limit: int = 1000): """ Args: default_limit: the default number of documents to return max_limit: max number of documents to return. """ self.default_limit = default_limit self.max_limit = max_limit def query( _page: int = Query( None, description="Page number to request (takes precedent over _limit and _skip).", ), _per_page: int = Query( default_limit, description="Number of entries to show per page (takes precedent over _limit and _skip)." 
f" Limited to {max_limit}.", ), _skip: int = Query( 0, description="Number of entries to skip in the search.", ), _limit: int = Query( default_limit, description=f"Max number of entries to return in a single query. Limited to {max_limit}.", ), ) -> STORE_PARAMS: """ Pagination parameters for the API Endpoint. """ if _page is not None: if _per_page > max_limit: raise HTTPException( status_code=400, detail="Requested more data per query than allowed by this endpoint." f" The max limit is {max_limit} entries", ) if _page < 0 or _per_page < 0: raise HTTPException( status_code=400, detail="Cannot request negative _page or _per_page values", ) return { "skip": ((_page - 1) * _per_page) if _page >= 1 else 0, "limit": _per_page, } else: if _limit > max_limit: raise HTTPException( status_code=400, detail="Requested more data per query than allowed by this endpoint." f" The max limit is {max_limit} entries", ) if _skip < 0 or _limit < 0: raise HTTPException( status_code=400, detail="Cannot request negative _skip or _limit values", ) return {"skip": _skip, "limit": _limit} self.query = query # type: ignore def query(self): """Stub query function for abstract class.""" def meta(self) -> dict: """ Metadata for the pagination params. """ return {"max_limit": self.max_limit} maggma-0.70.0/src/maggma/api/query_operator/sorting.py000066400000000000000000000033561470132070100227410ustar00rootroot00000000000000from typing import Optional from fastapi import Query from fastapi.exceptions import HTTPException from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS class SortQuery(QueryOperator): """Method to generate the sorting portion of a query.""" def __init__(self, fields: Optional[list[str]] = None, max_num: Optional[int] = None): """Sort query configuration. Args: fields (Optional[List[str]]): List of allowed fields to sort with max_num (Optional[int]): Max number of fields to simultaneously sort with """ self.fields = fields or [] self.max_num = max_num or 0 if self.max_num < 0: raise ValueError("Max number of fields should be larger than 0") def query( self, _sort_fields: Optional[str] = Query( None, description="Comma delimited fields to sort with.\ Prefixing '-' to a field will force a sort in descending order.", ), ) -> STORE_PARAMS: sort = {} if _sort_fields: field_list = _sort_fields.split(",") if self.max_num and len(field_list) > self.max_num: raise HTTPException( status_code=400, detail=f"Please provide at most {self.max_num} field(s) to sort with" ) for sort_field in field_list: query_entry = {sort_field: 1} if sort_field.startswith("-"): query_entry = {sort_field[1:]: -1} sort_field = sort_field[1:] if self.fields and sort_field not in self.fields: continue sort.update(query_entry) return {"sort": sort} maggma-0.70.0/src/maggma/api/query_operator/sparse_fields.py000066400000000000000000000046721470132070100241010ustar00rootroot00000000000000from typing import Optional from fastapi import Query from pydantic import BaseModel from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS from maggma.utils import dynamic_import class SparseFieldsQuery(QueryOperator): def __init__(self, model: type[BaseModel], default_fields: Optional[list[str]] = None): """ Args: model: PyDantic Model that represents the underlying data source default_fields: default fields to return in the API response if no fields are explicitly requested. 
""" self.model = model model_name = self.model.__name__ # type: ignore model_fields = list(self.model.__fields__.keys()) self.default_fields = model_fields if default_fields is None else list(default_fields) def query( _fields: str = Query( None, description=f"Fields to project from {model_name!s} as a list of comma separated strings.\ Fields include: `{'` `'.join(model_fields)}`", ), _all_fields: bool = Query(False, description="Include all fields."), ) -> STORE_PARAMS: """ Pagination parameters for the API Endpoint. """ properties = _fields.split(",") if isinstance(_fields, str) else self.default_fields if _all_fields: properties = model_fields return {"properties": properties} self.query = query # type: ignore def query(self): """Stub query function for abstract class.""" def meta(self) -> dict: """ Returns metadata for the Sparse field set. """ return {"default_fields": self.default_fields} def as_dict(self) -> dict: """ Special as_dict implemented to convert pydantic models into strings. """ d = super().as_dict() # Ensures sub-classes serialize correctly d["model"] = f"{self.model.__module__}.{self.model.__name__}" # type: ignore return d @classmethod def from_dict(cls, d): """ Special from_dict to autoload the pydantic model from the location string. """ model = d.get("model") if isinstance(model, str): model = dynamic_import(model) assert issubclass(model, BaseModel), "The resource model has to be a PyDantic Model" d["model"] = model return cls(**d) maggma-0.70.0/src/maggma/api/query_operator/submission.py000066400000000000000000000024441470132070100234440ustar00rootroot00000000000000from datetime import datetime from typing import Optional from fastapi import Query from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS class SubmissionQuery(QueryOperator): """ Method to generate a query for submission data using status and datetime. 
""" def __init__(self, status_enum): self.status_enum = status_enum def query( state: Optional[status_enum] = Query(None, description="Latest status of the submission"), last_updated: Optional[datetime] = Query( None, description="Minimum datetime of status update for submission", ), ) -> STORE_PARAMS: crit = {} # type: dict if state: s_dict = {"$expr": {"$eq": [{"$arrayElemAt": ["$state", -1]}, state.value]}} # type: ignore crit.update(s_dict) if last_updated: l_dict = {"$expr": {"$gt": [{"$arrayElemAt": ["$last_updated", -1]}, last_updated]}} crit.update(l_dict) if state and last_updated: crit = {"$and": [s_dict, l_dict]} return {"criteria": crit} self.query = query def query(self): """Stub query function for abstract class.""" maggma-0.70.0/src/maggma/api/resource/000077500000000000000000000000001470132070100174425ustar00rootroot00000000000000maggma-0.70.0/src/maggma/api/resource/__init__.py000066400000000000000000000012031470132070100215470ustar00rootroot00000000000000# isort: off from maggma.api.resource.core import HeaderProcessor, HintScheme, Resource # isort: on from maggma.api.resource.aggregation import AggregationResource from maggma.api.resource.post_resource import PostOnlyResource from maggma.api.resource.read_resource import ReadOnlyResource, attach_query_ops from maggma.api.resource.s3_url import S3URLResource from maggma.api.resource.submission import SubmissionResource __all__ = [ "Resource", "HintScheme", "HeaderProcessor", "AggregationResource", "PostOnlyResource", "ReadOnlyResource", "attach_query_ops", "SubmissionResource", "S3URLResource", ] maggma-0.70.0/src/maggma/api/resource/aggregation.py000066400000000000000000000100101470132070100222730ustar00rootroot00000000000000from typing import Any, Optional import orjson from fastapi import HTTPException, Request, Response from pydantic import BaseModel from pymongo import timeout as query_timeout from pymongo.errors import NetworkTimeout, PyMongoError from maggma.api.models import Meta from maggma.api.models import Response as ResponseModel from maggma.api.query_operator import QueryOperator from maggma.api.resource import HeaderProcessor, Resource from maggma.api.resource.utils import attach_query_ops from maggma.api.utils import STORE_PARAMS, merge_queries, serialization_helper from maggma.core import Store class AggregationResource(Resource): """ Implements a REST Compatible Resource as a GET URL endpoint. """ def __init__( self, store: Store, model: type[BaseModel], pipeline_query_operator: QueryOperator, timeout: Optional[int] = None, tags: Optional[list[str]] = None, include_in_schema: Optional[bool] = True, sub_path: Optional[str] = "/", header_processor: Optional[HeaderProcessor] = None, ): """ Args: store: The Maggma Store to get data from model: The pydantic model this Resource represents tags: List of tags for the Endpoint pipeline_query_operator: Operator for the aggregation pipeline timeout: Time in seconds Pymongo should wait when querying MongoDB before raising a timeout error include_in_schema: Whether the endpoint should be shown in the documented schema. sub_path: sub-URL path for the resource. 
""" self.store = store self.tags = tags or [] self.include_in_schema = include_in_schema self.sub_path = sub_path self.response_model = ResponseModel[model] # type: ignore self.pipeline_query_operator = pipeline_query_operator self.header_processor = header_processor self.timeout = timeout super().__init__(model) def prepare_endpoint(self): """ Internal method to prepare the endpoint by setting up default handlers for routes. """ self.build_dynamic_model_search() def build_dynamic_model_search(self): model_name = self.model.__name__ def search(**queries: dict[str, STORE_PARAMS]) -> dict: request: Request = queries.pop("request") # type: ignore queries.pop("temp_response") # type: ignore query: dict[Any, Any] = merge_queries(list(queries.values())) # type: ignore self.store.connect() try: with query_timeout(self.timeout): data = list(self.store._collection.aggregate(query["pipeline"])) except (NetworkTimeout, PyMongoError) as e: if e.timeout: raise HTTPException( status_code=504, detail="Server timed out trying to obtain data. Try again with a smaller request.", ) else: raise HTTPException( status_code=500, ) count = len(data) data = self.pipeline_query_operator.post_process(data, query) operator_meta = self.pipeline_query_operator.meta() meta = Meta(total_doc=count) response = {"data": data, "meta": {**meta.dict(), **operator_meta}} response = Response(orjson.dumps(response, default=serialization_helper)) # type: ignore if self.header_processor is not None: self.header_processor.process_header(response, request) return response self.router.get( self.sub_path, tags=self.tags, summary=f"Get {model_name} documents", response_model=self.response_model, response_description=f"Get {model_name} data", response_model_exclude_unset=True, )(attach_query_ops(search, [self.pipeline_query_operator])) maggma-0.70.0/src/maggma/api/resource/core.py000066400000000000000000000067721470132070100207600ustar00rootroot00000000000000import logging from abc import ABCMeta, abstractmethod from fastapi import APIRouter, FastAPI, Request, Response from monty.json import MontyDecoder, MSONable from pydantic import BaseModel from starlette.responses import RedirectResponse from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS, api_sanitize from maggma.utils import dynamic_import class Resource(MSONable, metaclass=ABCMeta): """ Base class for a REST Compatible Resource. """ def __init__( self, model: type[BaseModel], ): """ Args: model: the pydantic model this Resource represents. """ if not issubclass(model, BaseModel): raise ValueError("The resource model has to be a PyDantic Model") self.model = api_sanitize(model, allow_dict_msonable=True) self.logger = logging.getLogger(type(self).__name__) self.logger.addHandler(logging.NullHandler()) self.router = APIRouter() self.prepare_endpoint() self.setup_redirect() def on_startup(self): """ Callback to perform some work on resource initialization. """ @abstractmethod def prepare_endpoint(self): """ Internal method to prepare the endpoint by setting up default handlers for routes. """ def setup_redirect(self): @self.router.get("$", include_in_schema=False) def redirect_unslashed(): """ Redirects unforward slashed url to resource url with the forward slash. """ url = self.router.url_path_for("/") return RedirectResponse(url=url, status_code=301) def run(self): # pragma: no cover """ Runs the Endpoint cluster locally This is intended for testing not production. 
""" import uvicorn app = FastAPI() app.include_router(self.router, prefix="") uvicorn.run(app) def as_dict(self) -> dict: """ Special as_dict implemented to convert pydantic models into strings. """ d = super().as_dict() # Ensures sub-classes serialize correctly d["model"] = f"{self.model.__module__}.{self.model.__name__}" return d @classmethod def from_dict(cls, d: dict): if isinstance(d["model"], str): d["model"] = dynamic_import(d["model"]) d = {k: MontyDecoder().process_decoded(v) for k, v in d.items()} return cls(**d) class HintScheme(MSONable, metaclass=ABCMeta): """ Base class for generic hint schemes generation. """ @abstractmethod def generate_hints(self, query: STORE_PARAMS) -> STORE_PARAMS: """ This method takes in a MongoDB query and returns hints. """ class HeaderProcessor(MSONable, metaclass=ABCMeta): """ Base class for generic header processing. """ @abstractmethod def process_header(self, response: Response, request: Request): """ This method takes in a FastAPI Response object and processes a new header for it in-place. It can use data in the upstream request to generate the header. (https://fastapi.tiangolo.com/advanced/response-headers/#use-a-response-parameter). """ @abstractmethod def configure_query_on_request(self, request: Request, query_operator: QueryOperator) -> STORE_PARAMS: """ This method takes in a FastAPI Request object and returns a query to be used in the store. """ maggma-0.70.0/src/maggma/api/resource/post_resource.py000066400000000000000000000131401470132070100227070ustar00rootroot00000000000000from inspect import signature from typing import Any, Optional from fastapi import HTTPException, Request from pydantic import BaseModel from pymongo import timeout as query_timeout from pymongo.errors import NetworkTimeout, PyMongoError from maggma.api.models import Meta, Response from maggma.api.query_operator import PaginationQuery, QueryOperator, SparseFieldsQuery from maggma.api.resource import Resource from maggma.api.resource.utils import attach_query_ops, generate_query_pipeline from maggma.api.utils import STORE_PARAMS, merge_queries from maggma.core import Store from maggma.stores import S3Store class PostOnlyResource(Resource): """ Implements a REST Compatible Resource as a POST URL endpoint. """ def __init__( self, store: Store, model: type[BaseModel], tags: Optional[list[str]] = None, query_operators: Optional[list[QueryOperator]] = None, key_fields: Optional[list[str]] = None, query: Optional[dict] = None, timeout: Optional[int] = None, include_in_schema: Optional[bool] = True, sub_path: Optional[str] = "/", ): """ Args: store: The Maggma Store to get data from model: The pydantic model this Resource represents tags: List of tags for the Endpoint query_operators: Operators for the query language key_fields: List of fields to always project. Default uses SparseFieldsQuery to allow user to define these on-the-fly. timeout: Time in seconds Pymongo should wait when querying MongoDB before raising a timeout error include_in_schema: Whether the endpoint should be shown in the documented schema. sub_path: sub-URL path for the resource. 
""" self.store = store self.tags = tags or [] self.query = query or {} self.key_fields = key_fields self.versioned = False self.timeout = timeout self.include_in_schema = include_in_schema self.sub_path = sub_path self.response_model = Response[model] # type: ignore self.query_operators = ( query_operators if query_operators is not None else [ PaginationQuery(), SparseFieldsQuery( model, default_fields=[self.store.key, self.store.last_updated_field], ), ] ) super().__init__(model) def prepare_endpoint(self): """ Internal method to prepare the endpoint by setting up default handlers for routes. """ self.build_dynamic_model_search() def build_dynamic_model_search(self): model_name = self.model.__name__ def search(**queries: dict[str, STORE_PARAMS]) -> dict: request: Request = queries.pop("request") # type: ignore queries.pop("temp_response") # type: ignore query_params = [ entry for _, i in enumerate(self.query_operators) for entry in signature(i.query).parameters ] overlap = [key for key in request.query_params if key not in query_params] if any(overlap): raise HTTPException( status_code=400, detail="Request contains query parameters which cannot be used: {}".format(", ".join(overlap)), ) query: dict[Any, Any] = merge_queries(list(queries.values())) # type: ignore query["criteria"].update(self.query) self.store.connect() try: with query_timeout(self.timeout): count = self.store.count( # type: ignore **{field: query[field] for field in query if field in ["criteria", "hint"]} ) if isinstance(self.store, S3Store): data = list(self.store.query(**query)) # type: ignore else: pipeline = generate_query_pipeline(query, self.store) data = list( self.store._collection.aggregate( pipeline, **{field: query[field] for field in query if field in ["hint"]}, ) ) except (NetworkTimeout, PyMongoError) as e: if e.timeout: raise HTTPException( status_code=504, detail="Server timed out trying to obtain data. Try again with a smaller request.", ) else: raise HTTPException( status_code=500, detail="Server timed out trying to obtain data. 
Try again with a smaller request, " "or remove sorting fields and sort data locally.", ) operator_meta = {} for operator in self.query_operators: data = operator.post_process(data, query) operator_meta.update(operator.meta()) meta = Meta(total_doc=count) return {"data": data, "meta": {**meta.dict(), **operator_meta}} self.router.post( self.sub_path, tags=self.tags, summary=f"Post {model_name} documents", response_model=self.response_model, response_description=f"Post {model_name} data", response_model_exclude_unset=True, )(attach_query_ops(search, self.query_operators)) maggma-0.70.0/src/maggma/api/resource/read_resource.py000066400000000000000000000277141470132070100226510ustar00rootroot00000000000000from inspect import signature from typing import Any, Optional, Union import orjson from fastapi import Depends, HTTPException, Path, Request, Response from pydantic import BaseModel from pymongo import timeout as query_timeout from pymongo.errors import NetworkTimeout, PyMongoError from maggma.api.models import Meta from maggma.api.models import Response as ResponseModel from maggma.api.query_operator import PaginationQuery, QueryOperator, SparseFieldsQuery from maggma.api.resource import HeaderProcessor, HintScheme, Resource from maggma.api.resource.utils import attach_query_ops, generate_query_pipeline from maggma.api.utils import STORE_PARAMS, merge_queries, serialization_helper from maggma.core import Store from maggma.stores import MongoStore, S3Store class ReadOnlyResource(Resource): """ Implements a REST Compatible Resource as a GET URL endpoint This class provides a number of convenience features including full pagination, field projection. """ def __init__( self, store: Store, model: type[BaseModel], tags: Optional[list[str]] = None, query_operators: Optional[list[QueryOperator]] = None, key_fields: Optional[list[str]] = None, hint_scheme: Optional[HintScheme] = None, header_processor: Optional[HeaderProcessor] = None, query_to_configure_on_request: Optional[QueryOperator] = None, timeout: Optional[int] = None, enable_get_by_key: bool = False, enable_default_search: bool = True, disable_validation: bool = False, query_disk_use: bool = False, include_in_schema: Optional[bool] = True, sub_path: Optional[str] = "/", ): """ Args: store: The Maggma Store to get data from model: The pydantic model this Resource represents tags: List of tags for the Endpoint query_operators: Operators for the query language hint_scheme: The hint scheme to use for this resource header_processor: The header processor to use for this resource query_to_configure_on_request: Query operator to configure on request timeout: Time in seconds Pymongo should wait when querying MongoDB before raising a timeout error key_fields: List of fields to always project. Default uses SparseFieldsQuery to allow user to define these on-the-fly. enable_get_by_key: Enable get by key route for endpoint. enable_default_search: Enable default endpoint search behavior. query_disk_use: Whether to use temporary disk space in large MongoDB queries. disable_validation: Whether to use ORJSON and provide a direct FastAPI response. Note this will disable auto JSON serialization and response validation with the provided model. include_in_schema: Whether the endpoint should be shown in the documented schema. sub_path: sub-URL path for the resource. 
""" self.store = store self.tags = tags or [] self.hint_scheme = hint_scheme self.header_processor = header_processor self.query_to_configure_on_request = query_to_configure_on_request self.key_fields = key_fields self.versioned = False self.enable_get_by_key = enable_get_by_key self.enable_default_search = enable_default_search self.timeout = timeout self.disable_validation = disable_validation self.include_in_schema = include_in_schema self.sub_path = sub_path self.query_disk_use = query_disk_use self.response_model = ResponseModel[model] # type: ignore if not isinstance(store, MongoStore) and self.hint_scheme is not None: raise ValueError("Hint scheme is only supported for MongoDB stores") self.query_operators = ( query_operators if query_operators is not None else [ PaginationQuery(), SparseFieldsQuery( model, default_fields=[self.store.key, self.store.last_updated_field], ), ] ) super().__init__(model) def prepare_endpoint(self): """ Internal method to prepare the endpoint by setting up default handlers for routes. """ if self.enable_get_by_key: self.build_get_by_key() if self.enable_default_search: self.build_dynamic_model_search() def build_get_by_key(self): key_name = self.store.key model_name = self.model.__name__ if self.key_fields is None: field_input = SparseFieldsQuery(self.model, [self.store.key, self.store.last_updated_field]).query else: def field_input(): return {"properties": self.key_fields} def get_by_key( request: Request, temp_response: Response, key: str = Path( ..., alias=key_name, title=f"The {key_name} of the {model_name} to get", ), _fields: STORE_PARAMS = Depends(field_input), ): f""" Gets a document by the primary key in the store Args: {key_name}: the id of a single {model_name} Returns: a single {model_name} document """ self.store.connect() try: with query_timeout(self.timeout): item = [ self.store.query_one( criteria={self.store.key: key}, properties=_fields["properties"], ) ] except (NetworkTimeout, PyMongoError) as e: if e.timeout: raise HTTPException( status_code=504, detail="Server timed out trying to obtain data. 
Try again with a smaller request.", ) else: raise HTTPException( status_code=500, ) if item == [None]: raise HTTPException( status_code=404, detail=f"Item with {self.store.key} = {key} not found", ) for operator in self.query_operators: item = operator.post_process(item, {}) response = {"data": item} # type: ignore if self.disable_validation: response = Response(orjson.dumps(response, default=serialization_helper)) # type: ignore if self.header_processor is not None: if self.disable_validation: self.header_processor.process_header(response, request) else: self.header_processor.process_header(temp_response, request) return response self.router.get( f"{self.sub_path}{{{key_name}}}/", summary=f"Get a {model_name} document by by {key_name}", response_description=f"Get a {model_name} document by {key_name}", response_model=self.response_model, response_model_exclude_unset=True, tags=self.tags, include_in_schema=self.include_in_schema, )(get_by_key) def build_dynamic_model_search(self): model_name = self.model.__name__ def search(**queries: dict[str, STORE_PARAMS]) -> Union[dict, Response]: request: Request = queries.pop("request") # type: ignore temp_response: Response = queries.pop("temp_response") # type: ignore if self.query_to_configure_on_request is not None: # give the key name "request", arbitrary choice, as only the value gets merged into the query queries["groups"] = self.header_processor.configure_query_on_request( request=request, query_operator=self.query_to_configure_on_request ) # allowed query parameters query_params = [ entry for _, i in enumerate(self.query_operators) for entry in signature(i.query).parameters ] # check for overlap between allowed query parameters and request query parameters overlap = [key for key in request.query_params if key not in query_params] if any(overlap): if "limit" in overlap or "skip" in overlap: raise HTTPException( status_code=400, detail="'limit' and 'skip' parameters have been renamed. " "Please update your API client to the newest version.", ) else: raise HTTPException( status_code=400, detail="Request contains query parameters which cannot be used: {}".format(", ".join(overlap)), ) query: dict[Any, Any] = merge_queries(list(queries.values())) # type: ignore if self.hint_scheme is not None: # pragma: no cover hints = self.hint_scheme.generate_hints(query) query.update(hints) self.store.connect() try: with query_timeout(self.timeout): if isinstance(self.store, S3Store): count = self.store.count(criteria=query.get("criteria")) # type: ignore if self.query_disk_use: data = list(self.store.query(**query, allow_disk_use=True)) # type: ignore else: data = list(self.store.query(**query)) else: count = self.store.count( criteria=query.get("criteria"), hint=query.get("count_hint") ) # type: ignore pipeline = generate_query_pipeline(query, self.store) agg_kwargs = {} if query.get("agg_hint"): agg_kwargs["hint"] = query["agg_hint"] data = list(self.store._collection.aggregate(pipeline, **agg_kwargs)) except (NetworkTimeout, PyMongoError) as e: if e.timeout: raise HTTPException( status_code=504, detail="Server timed out trying to obtain data. Try again with a smaller request.", ) else: raise HTTPException( status_code=500, detail="Server timed out trying to obtain data. 
Try again with a smaller request," " or remove sorting fields and sort data locally.", ) operator_meta = {} for operator in self.query_operators: data = operator.post_process(data, query) operator_meta.update(operator.meta()) meta = Meta(total_doc=count) response = {"data": data, "meta": {**meta.dict(), **operator_meta}} # type: ignore if self.disable_validation: response = Response(orjson.dumps(response, default=serialization_helper)) # type: ignore if self.header_processor is not None: if self.disable_validation: self.header_processor.process_header(response, request) else: self.header_processor.process_header(temp_response, request) return response self.router.get( self.sub_path, tags=self.tags, summary=f"Get {model_name} documents", response_model=self.response_model, response_description=f"Search for a {model_name}", response_model_exclude_unset=True, )(attach_query_ops(search, self.query_operators)) maggma-0.70.0/src/maggma/api/resource/s3_url.py000066400000000000000000000115551470132070100212320ustar00rootroot00000000000000from datetime import datetime, timedelta from typing import Optional import orjson from botocore.exceptions import ClientError from fastapi import HTTPException, Path, Request, Response from maggma.api.models import Response as ResponseModel from maggma.api.models import S3URLDoc from maggma.api.resource import HeaderProcessor, Resource from maggma.api.utils import serialization_helper from maggma.stores.aws import S3Store class S3URLResource(Resource): """ Implements a REST Compatible Resource as a GET URL endpoint that provides pre-signed S3 URLs. """ def __init__( self, store: S3Store, url_lifetime: int, tags: Optional[list[str]] = None, header_processor: Optional[HeaderProcessor] = None, disable_validation: bool = False, include_in_schema: Optional[bool] = True, sub_path: Optional[str] = "/", ): """ Args: store: The Maggma Store to get data from url_lifetime: URL lifetime in seconds header_processor: The header processor to use for this resource disable_validation: Whether to use ORJSON and provide a direct FastAPI response. Note this will disable auto JSON serialization and response validation with the provided model. include_in_schema: Whether the endpoint should be shown in the documented schema. sub_path: sub-URL path for the resource. """ self.store = store self.url_lifetime = url_lifetime self.tags = tags or [] self.header_processor = header_processor self.disable_validation = disable_validation self.include_in_schema = include_in_schema self.sub_path = sub_path self.response_model = ResponseModel[S3URLDoc] # type: ignore super().__init__(S3URLDoc) def prepare_endpoint(self): """ Internal method to prepare the endpoint by setting up default handlers for routes. 
""" self.build_get_by_key() def build_get_by_key(self): key_name = self.store.key model_name = self.model.__name__ def get_by_key( request: Request, temp_response: Response, key: str = Path( ..., alias=key_name, title=f"The {key_name} of the {model_name} to get", ), ): f""" Gets a document by the primary key in the store Args: {key_name}: the id of a single {model_name} Returns: A single pre-signed URL {model_name} document """ self.store.connect() if self.store.sub_dir is not None: key = self.store.sub_dir.strip("/") + "/" + key # Make sure object is in bucket try: self.store.s3.Object(self.store.bucket, key).load() except ClientError: raise HTTPException( status_code=404, detail="No object found for {} = {}".format(self.store.key, key.split("/")[-1]), ) # Get URL try: url = self.store.s3.meta.client.generate_presigned_url( ClientMethod="get_object", Params={"Bucket": self.store.bucket, "Key": key}, ExpiresIn=self.url_lifetime, ) except Exception: raise HTTPException( status_code=404, detail="Problem obtaining URL for {} = {}".format(self.store.key, key.split("/")[-1]), ) requested_datetime = datetime.utcnow() expiry_datetime = requested_datetime + timedelta(seconds=self.url_lifetime) item = S3URLDoc( url=url, requested_datetime=requested_datetime, expiry_datetime=expiry_datetime, ) response = {"data": [item.dict()]} # type: ignore if self.disable_validation: response = Response(orjson.dumps(response, default=serialization_helper)) # type: ignore if self.header_processor is not None: if self.disable_validation: self.header_processor.process_header(response, request) else: self.header_processor.process_header(temp_response, request) return response self.router.get( f"{self.sub_path}{{{key_name}}}/", summary=f"Get a {model_name} document by by {key_name}", response_description=f"Get a {model_name} document by {key_name}", response_model=self.response_model, response_model_exclude_unset=True, tags=self.tags, include_in_schema=self.include_in_schema, )(get_by_key) maggma-0.70.0/src/maggma/api/resource/submission.py000066400000000000000000000361271470132070100222200ustar00rootroot00000000000000from datetime import datetime from enum import Enum from inspect import signature from typing import Any, Optional from uuid import uuid4 from fastapi import HTTPException, Path, Request from pydantic import BaseModel, Field, create_model from pymongo import timeout as query_timeout from pymongo.errors import NetworkTimeout, PyMongoError from maggma.api.models import Meta, Response from maggma.api.query_operator import QueryOperator, SubmissionQuery from maggma.api.resource import Resource from maggma.api.resource.utils import attach_query_ops, generate_query_pipeline from maggma.api.utils import STORE_PARAMS, merge_queries from maggma.core import Store from maggma.stores import S3Store class SubmissionResource(Resource): """ Implements a REST Compatible Resource as POST and/or GET and/or PATCH URL endpoints for submitted data. 
""" def __init__( self, store: Store, model: type[BaseModel], post_query_operators: list[QueryOperator], get_query_operators: list[QueryOperator], patch_query_operators: Optional[list[QueryOperator]] = None, tags: Optional[list[str]] = None, timeout: Optional[int] = None, include_in_schema: Optional[bool] = True, duplicate_fields_check: Optional[list[str]] = None, enable_default_search: Optional[bool] = True, state_enum: Optional[Enum] = None, default_state: Optional[Any] = None, calculate_submission_id: Optional[bool] = False, get_sub_path: Optional[str] = "/", post_sub_path: Optional[str] = "/", patch_sub_path: Optional[str] = "/", ): """ Args: store: The Maggma Store to get data from model: The pydantic model this resource represents tags: List of tags for the Endpoint timeout: Time in seconds Pymongo should wait when querying MongoDB before raising a timeout error post_query_operators: Operators for the query language for post data get_query_operators: Operators for the query language for get data patch_query_operators: Operators for the query language for patch data include_in_schema: Whether to include the submission resource in the documented schema duplicate_fields_check: Fields in model used to check for duplicates for POST data enable_default_search: Enable default endpoint search behavior. state_enum: State Enum defining possible data states default_state: Default state value in provided state Enum calculate_submission_id: Whether to calculate and use a submission ID as primary data key. If False, the store key is used instead. get_sub_path: GET sub-URL path for the resource. post_sub_path: POST sub-URL path for the resource. patch_sub_path: PATCH sub-URL path for the resource. """ if isinstance(state_enum, Enum) and default_state not in [entry.value for entry in state_enum]: # type: ignore raise RuntimeError("If data is stateful a state enum and valid default value must be provided") self.state_enum = state_enum self.default_state = default_state self.store = store self.tags = tags or [] self.timeout = timeout self.post_query_operators = post_query_operators self.get_query_operators = ( [op for op in get_query_operators if op is not None] + [SubmissionQuery(state_enum)] # type: ignore if state_enum is not None else get_query_operators ) self.patch_query_operators = patch_query_operators self.include_in_schema = include_in_schema self.duplicate_fields_check = duplicate_fields_check self.enable_default_search = enable_default_search self.calculate_submission_id = calculate_submission_id self.get_sub_path = get_sub_path self.post_sub_path = post_sub_path self.patch_sub_path = patch_sub_path new_fields = {} # type: dict if self.calculate_submission_id: new_fields["submission_id"] = ( str, Field(..., description="Unique submission ID"), ) if state_enum is not None: new_fields["state"] = ( list[state_enum], # type: ignore Field(..., description="List of data status descriptions"), ) new_fields["updated"] = ( list[datetime], Field(..., description="List of status update datetimes"), ) if new_fields: model = create_model(model.__name__, __base__=model, **new_fields) self.response_model = Response[model] # type: ignore super().__init__(model) def prepare_endpoint(self): """ Internal method to prepare the endpoint by setting up default handlers for routes. 
""" if self.enable_default_search: self.build_search_data() self.build_get_by_key() self.build_post_data() if self.patch_query_operators: self.build_patch_data() def build_get_by_key(self): model_name = self.model.__name__ key_name = "submission_id" if self.calculate_submission_id else self.store.key def get_by_key( key: str = Path( ..., alias=key_name, description=f"The {key_name} of the {model_name} to get", ), ): f""" Get a document using the {key_name} Args: {key_name}: the id of a single {model_name} Returns: a single {model_name} document """ self.store.connect() crit = {key_name: key} try: with query_timeout(self.timeout): item = [self.store.query_one(criteria=crit)] except (NetworkTimeout, PyMongoError) as e: if e.timeout: raise HTTPException( status_code=504, detail="Server timed out trying to obtain data. Try again with a smaller request.", ) else: raise HTTPException(status_code=500) if item == [None]: raise HTTPException( status_code=404, detail=f"Item with submission ID = {key} not found", ) for operator in self.get_query_operators: # type: ignore item = operator.post_process(item, {}) return {"data": item} self.router.get( f"{self.get_sub_path}{{{key_name}}}/", response_description=f"Get an {model_name} by {key_name}", response_model=self.response_model, response_model_exclude_unset=True, tags=self.tags, include_in_schema=self.include_in_schema, )(get_by_key) def build_search_data(self): model_name = self.model.__name__ def search(**queries: STORE_PARAMS): request: Request = queries.pop("request") # type: ignore queries.pop("temp_response") # type: ignore query: STORE_PARAMS = merge_queries(list(queries.values())) query_params = [ entry for _, i in enumerate(self.get_query_operators) # type: ignore for entry in signature(i.query).parameters ] overlap = [key for key in request.query_params if key not in query_params] if any(overlap): raise HTTPException( status_code=404, detail="Request contains query parameters which cannot be used: {}".format(", ".join(overlap)), ) self.store.connect(force_reset=True) try: with query_timeout(self.timeout): count = self.store.count( # type: ignore **{field: query[field] for field in query if field in ["criteria", "hint"]} ) if isinstance(self.store, S3Store): data = list(self.store.query(**query)) # type: ignore else: pipeline = generate_query_pipeline(query, self.store) data = list( self.store._collection.aggregate( pipeline, **{field: query[field] for field in query if field in ["hint"]}, ) ) except (NetworkTimeout, PyMongoError) as e: if e.timeout: raise HTTPException( status_code=504, detail="Server timed out trying to obtain data. Try again with a smaller request.", ) else: raise HTTPException( status_code=500, detail="Server timed out trying to obtain data. 
Try again with a smaller request, " "or remove sorting fields and sort data locally.", ) meta = Meta(total_doc=count) for operator in self.get_query_operators: # type: ignore data = operator.post_process(data, query) return {"data": data, "meta": meta.dict()} self.router.get( self.get_sub_path, tags=self.tags, summary=f"Get {model_name} data", response_model=self.response_model, response_description="Search for {model_name} data", response_model_exclude_unset=True, include_in_schema=self.include_in_schema, )(attach_query_ops(search, self.get_query_operators)) def build_post_data(self): model_name = self.model.__name__ def post_data(**queries: STORE_PARAMS): request: Request = queries.pop("request") # type: ignore queries.pop("temp_response") # type: ignore query: STORE_PARAMS = merge_queries(list(queries.values())) query_params = [ entry for _, i in enumerate(self.post_query_operators) # type: ignore for entry in signature(i.query).parameters ] overlap = [key for key in request.query_params if key not in query_params] if any(overlap): raise HTTPException( status_code=404, detail="Request contains query parameters which cannot be used: {}".format(", ".join(overlap)), ) self.store.connect(force_reset=True) # Check for duplicate entry if self.duplicate_fields_check: duplicate = self.store.query_one( criteria={field: query["criteria"][field] for field in self.duplicate_fields_check} ) if duplicate: raise HTTPException( status_code=400, detail="Submission already exists. Duplicate data found for fields: {}".format( ", ".join(self.duplicate_fields_check) ), ) if self.calculate_submission_id: query["criteria"]["submission_id"] = str(uuid4()) if self.state_enum is not None: query["criteria"]["state"] = [self.default_state] query["criteria"]["updated"] = [datetime.utcnow()] try: self.store.update(docs=query["criteria"]) # type: ignore except Exception: raise HTTPException( status_code=400, detail="Problem when trying to post data.", ) return { "data": query["criteria"], "meta": "Submission successful", } self.router.post( self.post_sub_path, tags=self.tags, summary=f"Post {model_name} data", response_model=None, response_description=f"Post {model_name} data", response_model_exclude_unset=True, include_in_schema=self.include_in_schema, )(attach_query_ops(post_data, self.post_query_operators)) def build_patch_data(self): model_name = self.model.__name__ def patch_data(**queries: STORE_PARAMS): request: Request = queries.pop("request") # type: ignore queries.pop("temp_response") # type: ignore query: STORE_PARAMS = merge_queries(list(queries.values())) query_params = [ entry for _, i in enumerate(self.patch_query_operators) # type: ignore for entry in signature(i.query).parameters ] overlap = [key for key in request.query_params if key not in query_params] if any(overlap): raise HTTPException( status_code=404, detail="Request contains query parameters which cannot be used: {}".format(", ".join(overlap)), ) self.store.connect(force_reset=True) # Check for duplicate entry if self.duplicate_fields_check: duplicate = self.store.query_one( criteria={field: query["criteria"][field] for field in self.duplicate_fields_check} ) if duplicate: raise HTTPException( status_code=400, detail="Submission already exists. 
Duplicate data found for fields: {}".format( ", ".join(self.duplicate_fields_check) ), ) if self.calculate_submission_id: query["criteria"]["submission_id"] = str(uuid4()) if self.state_enum is not None: query["criteria"]["state"] = [self.default_state] query["criteria"]["updated"] = [datetime.utcnow()] if query.get("update"): try: self.store._collection.update_one( filter=query["criteria"], update={"$set": query["update"]}, upsert=False, ) except Exception: raise HTTPException( status_code=400, detail="Problem when trying to patch data.", ) return { "data": query["update"], "meta": "Submission successful", } self.router.patch( self.patch_sub_path, tags=self.tags, summary=f"Patch {model_name} data", response_model=None, response_description=f"Patch {model_name} data", response_model_exclude_unset=True, include_in_schema=self.include_in_schema, )(attach_query_ops(patch_data, self.patch_query_operators)) maggma-0.70.0/src/maggma/api/resource/utils.py000066400000000000000000000034031470132070100211540ustar00rootroot00000000000000from typing import Callable from fastapi import Depends, Request, Response from maggma.api.query_operator import QueryOperator from maggma.api.utils import STORE_PARAMS, attach_signature from maggma.core.store import Store def attach_query_ops( function: Callable[[list[STORE_PARAMS]], dict], query_ops: list[QueryOperator] ) -> Callable[[list[STORE_PARAMS]], dict]: """ Attach query operators to API compliant function The function has to take a list of STORE_PARAMs as the only argument. Args: function: the function to decorate """ attach_signature( function, annotations={ **{f"dep{i}": STORE_PARAMS for i, _ in enumerate(query_ops)}, "request": Request, "temp_response": Response, }, defaults={f"dep{i}": Depends(dep.query) for i, dep in enumerate(query_ops)}, ) return function def generate_query_pipeline(query: dict, store: Store): """ Generate the generic aggregation pipeline used in GET endpoint queries. 
Args: query: Query parameters store: Store containing endpoint data """ pipeline = [ {"$match": query["criteria"]}, ] sorting = query.get("sort", False) if sorting: sort_dict = {"$sort": {}} # type: dict sort_dict["$sort"].update(query["sort"]) projection_dict = {"_id": 0} # Do not return _id by default if query.get("properties", False): projection_dict.update({p: 1 for p in query["properties"]}) if sorting: pipeline.append(sort_dict) pipeline.append({"$project": projection_dict}) pipeline.append({"$skip": query.get("skip", 0)}) if query.get("limit", False): pipeline.append({"$limit": query["limit"]}) return pipeline maggma-0.70.0/src/maggma/api/utils.py000066400000000000000000000124041470132070100173260ustar00rootroot00000000000000import base64 import inspect from typing import ( Any, Callable, Literal, Optional, Union, get_args, # pragma: no cover ) from bson.objectid import ObjectId from monty.json import MSONable from pydantic import BaseModel from pydantic._internal._utils import lenient_issubclass from pydantic.fields import FieldInfo from maggma.utils import get_flat_models_from_model QUERY_PARAMS = ["criteria", "properties", "skip", "limit"] STORE_PARAMS = dict[ Literal[ "criteria", "properties", "sort", "skip", "limit", "request", "pipeline", "count_hint", "agg_hint", "update", ], Any, ] def merge_queries(queries: list[STORE_PARAMS]) -> STORE_PARAMS: criteria: STORE_PARAMS = {} properties: list[str] = [] for sub_query in queries: if "criteria" in sub_query: criteria.update(sub_query["criteria"]) if "properties" in sub_query: properties.extend(sub_query["properties"]) remainder = {k: v for query in queries for k, v in query.items() if k not in ["criteria", "properties"]} return { "criteria": criteria, "properties": properties if len(properties) > 0 else None, **remainder, } def attach_signature(function: Callable, defaults: dict, annotations: dict): """ Attaches signature for defaults and annotations for parameters to function. Args: function: callable function to attach the signature to defaults: dictionary of parameters -> default values annotations: dictionary of type annotations for the parameters """ required_params = [ inspect.Parameter( param, inspect.Parameter.POSITIONAL_OR_KEYWORD, default=defaults.get(param), annotation=annotations.get(param), ) for param in annotations if param not in defaults ] optional_params = [ inspect.Parameter( param, inspect.Parameter.POSITIONAL_OR_KEYWORD, default=defaults.get(param), annotation=annotations.get(param), ) for param in defaults ] function.__signature__ = inspect.Signature(required_params + optional_params) def api_sanitize( pydantic_model: BaseModel, fields_to_leave: Optional[Union[str, None]] = None, allow_dict_msonable=False, ): """Function to clean up pydantic models for the API by: 1.) Making fields optional 2.) Allowing dictionaries in-place of the objects for MSONable quantities. WARNING: This works in place, so it mutates the model and all sub-models Args: pydantic_model (BaseModel): Pydantic model to alter fields_to_leave (list[str] | None): list of strings for model fields as "model__name__.field". Defaults to None. allow_dict_msonable (bool): Whether to allow dictionaries in place of MSONable quantities. 
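# --- Editor-added illustrative sketch (not part of the original source) ---
# The aggregation pipeline produced by ``generate_query_pipeline`` above for a
# typical merged query.  The field names are hypothetical, and the store
# argument is not used to build the stages shown here, so any store works.
from maggma.api.resource.utils import generate_query_pipeline
from maggma.stores import MemoryStore

query = {
    "criteria": {"state": "successful"},
    "sort": {"energy": -1},
    "properties": ["task_id", "energy"],
    "skip": 10,
    "limit": 5,
}
pipeline = generate_query_pipeline(query, MemoryStore())
assert pipeline == [
    {"$match": {"state": "successful"}},
    {"$sort": {"energy": -1}},
    {"$project": {"_id": 0, "task_id": 1, "energy": 1}},
    {"$skip": 10},
    {"$limit": 5},
]
# --- end editor-added sketch ---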
Defaults to False """ models = [ model for model in get_flat_models_from_model(pydantic_model) if issubclass(model, BaseModel) ] # type: list[BaseModel] fields_to_leave = fields_to_leave or [] fields_tuples = [f.split(".") for f in fields_to_leave] assert all(len(f) == 2 for f in fields_tuples) for model in models: model_fields_to_leave = {f[1] for f in fields_tuples if model.__name__ == f[0]} for name in model.model_fields: field = model.model_fields[name] field_type = field.annotation if field_type is not None and allow_dict_msonable: if lenient_issubclass(field_type, MSONable): field_type = allow_msonable_dict(field_type) else: for sub_type in get_args(field_type): if lenient_issubclass(sub_type, MSONable): allow_msonable_dict(sub_type) if name not in model_fields_to_leave: new_field = FieldInfo.from_annotated_attribute(Optional[field_type], None) model.model_fields[name] = new_field model.model_rebuild(force=True) return pydantic_model def allow_msonable_dict(monty_cls: type[MSONable]): """ Patch Monty to allow for dict values for MSONable. """ def validate_monty(cls, v, _): """ Stub validator for MSONable as a dictionary only. """ if isinstance(v, cls): return v elif isinstance(v, dict): # Just validate the simple Monty Dict Model errors = [] if v.get("@module", "") != monty_cls.__module__: errors.append("@module") if v.get("@class", "") != monty_cls.__name__: errors.append("@class") if len(errors) > 0: raise ValueError("Missing Monty serialization fields in dictionary: {errors}") return v else: raise ValueError(f"Must provide {cls.__name__} or MSONable dictionary") monty_cls.validate_monty_v2 = classmethod(validate_monty) return monty_cls def serialization_helper(obj): if isinstance(obj, ObjectId): return str(obj) elif isinstance(obj, bytes): return base64.b64encode(obj).decode("utf-8") raise TypeError maggma-0.70.0/src/maggma/builders/000077500000000000000000000000001470132070100166535ustar00rootroot00000000000000maggma-0.70.0/src/maggma/builders/__init__.py000066400000000000000000000003331470132070100207630ustar00rootroot00000000000000from maggma.builders.group_builder import GroupBuilder from maggma.builders.map_builder import CopyBuilder, MapBuilder from maggma.core import Builder __all__ = ["GroupBuilder", "CopyBuilder", "MapBuilder", "Builder"] maggma-0.70.0/src/maggma/builders/group_builder.py000066400000000000000000000203561470132070100220750ustar00rootroot00000000000000""" Many-to-Many GroupBuilder. """ import traceback from abc import ABCMeta, abstractmethod from collections.abc import Iterable, Iterator from datetime import datetime from math import ceil from time import time from typing import Optional from pydash import get from maggma.core import Builder, Store from maggma.utils import Timeout, grouper class GroupBuilder(Builder, metaclass=ABCMeta): """ Group source docs and produces merged documents for each group Supports incremental building, where a source group gets (re)built only if it has a newer (by last_updated_field) doc than the corresponding (by key) target doc. This is a Many-to-One or Many-to-Many Builder. As a result, this builder can't determine when a source document is orphaned. """ def __init__( self, source: Store, target: Store, grouping_keys: list[str], query: Optional[dict] = None, projection: Optional[list] = None, timeout: int = 0, store_process_time: bool = True, retry_failed: bool = False, **kwargs, ): """ Args: source: source store target: target store query: optional query to filter items from the source store. 
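# --- Editor-added illustrative sketch (not part of the original source) ---
# ``api_sanitize`` mutates a model in place so that every field becomes
# optional, which is what lets sparse-field responses validate.  ``TaskDoc`` is
# a hypothetical model used only for this example.
from pydantic import BaseModel

from maggma.api.utils import api_sanitize


class TaskDoc(BaseModel):
    task_id: str
    energy: float


api_sanitize(TaskDoc)  # works in place
doc = TaskDoc()  # all fields are now optional and default to None
assert doc.task_id is None and doc.energy is None
# --- end editor-added sketch ---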
projection: list of keys to project from the source for processing. Limits data transfer to improve efficiency. delete_orphans: Whether to delete documents on target store with key values not present in source store. Deletion happens after all updates, during Builder.finalize. timeout: maximum running time per item in seconds store_process_time: If True, add "_process_time" key to document for profiling purposes retry_failed: If True, will retry building documents that previously failed. """ self.source = source self.target = target self.grouping_keys = grouping_keys self.query = query if query else {} self.projection = projection self.kwargs = kwargs self.timeout = timeout self.store_process_time = store_process_time self.retry_failed = retry_failed self._target_keys_field = f"{self.source.key}s" super().__init__(sources=[source], targets=[target], **kwargs) def ensure_indexes(self): """ Ensures indices on critical fields for GroupBuilder which include the plural version of the target's key field. """ index_checks = [ self.source.ensure_index(self.source.key), self.source.ensure_index(self.source.last_updated_field), self.target.ensure_index(self.target.key), self.target.ensure_index(self.target.last_updated_field), self.target.ensure_index("state"), self.target.ensure_index(self._target_keys_field), ] if not all(index_checks): self.logger.warning( "Missing one or more important indices on stores. " "Performance for large stores may be severely degraded. " "Ensure indices on target.key and " "[(store.last_updated_field, -1), (store.key, 1)] " "for each of source and target." ) def prechunk(self, number_splits: int) -> Iterator[dict]: """ Generic prechunk for group builder to perform domain-decomposition by the grouping keys. """ self.ensure_indexes() keys = self.get_ids_to_process() groups = self.get_groups_from_keys(keys) N = ceil(len(groups) / number_splits) for split in grouper(keys, N): yield {"query": dict(zip(self.grouping_keys, split))} def get_items(self): self.logger.info(f"Starting {self.__class__.__name__} Builder") self.ensure_indexes() keys = self.get_ids_to_process() groups = self.get_groups_from_keys(keys) if self.projection: projection = list({*self.projection, self.source.key, self.source.last_updated_field}) else: projection = None self.total = len(groups) for group in groups: group_criteria = dict(zip(self.grouping_keys, group)) group_criteria.update(self.query) yield list(self.source.query(criteria=group_criteria, properties=projection)) def process_item(self, item: list[dict]) -> dict[tuple, dict]: # type: ignore keys = [d[self.source.key] for d in item] self.logger.debug(f"Processing: {keys}") time_start = time() try: with Timeout(seconds=self.timeout): processed = self.unary_function(item) processed.update({"state": "successful"}) except Exception as e: self.logger.error(traceback.format_exc()) processed = {"error": str(e), "state": "failed"} time_end = time() last_updated = [self.source._lu_func[0](d[self.source.last_updated_field]) for d in item] update_doc = { self.target.key: keys[0], f"{self.source.key}s": keys, self.target.last_updated_field: max(last_updated), "_bt": datetime.utcnow(), } processed.update({k: v for k, v in update_doc.items() if k not in processed}) if self.store_process_time: processed["_process_time"] = time_end - time_start return processed def update_targets(self, items: list[dict]): """ Generic update targets for Group Builder. 
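# --- Editor-added illustrative sketch (not part of the original source) ---
# A minimal ``GroupBuilder`` subclass.  The field names ("formula", "energy")
# and store configuration are hypothetical; ``unary_function`` receives every
# source document sharing the grouping keys and returns one merged document
# for the group.
from maggma.builders import GroupBuilder
from maggma.stores import MongoStore


class LowestEnergyBuilder(GroupBuilder):
    def unary_function(self, items: list[dict]) -> dict:
        # keep the lowest-energy entry for each formula group
        best = min(items, key=lambda doc: doc["energy"])
        return {"formula": best["formula"], "lowest_energy": best["energy"]}


source = MongoStore(database="maggma_example", collection_name="tasks", key="task_id")
target = MongoStore(database="maggma_example", collection_name="lowest_energy", key="task_id")
builder = LowestEnergyBuilder(source, target, grouping_keys=["formula"])
# --- end editor-added sketch ---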
""" target = self.target for item in items: if "_id" in item: del item["_id"] if len(items) > 0: target.update(items) @abstractmethod def unary_function(self, items: list[dict]) -> dict: """ Processing function for GroupBuilder. Arguments: items: list of of documents with matching grouping keys Returns: Dictionary mapping: tuple of source document keys that are in the grouped document to the grouped and processed document """ def get_ids_to_process(self) -> Iterable: """ Gets the IDs that need to be processed. """ distinct_from_target = list(self.target.distinct(self._target_keys_field, criteria=self.query)) processed_ids = [] # Not always guaranteed that MongoDB will unpack the list so we # have to make sure we do that for d in distinct_from_target: if isinstance(d, list): processed_ids.extend(d) else: processed_ids.append(d) all_ids = set(self.source.distinct(self.source.key, criteria=self.query)) self.logger.debug(f"Found {len(all_ids)} total docs in source") if self.retry_failed: failed_keys = self.target.distinct(self._target_keys_field, criteria={"state": "failed", **self.query}) unprocessed_ids = all_ids - (set(processed_ids) - set(failed_keys)) self.logger.debug(f"Found {len(failed_keys)} failed IDs in target") else: unprocessed_ids = all_ids - set(processed_ids) self.logger.info(f"Found {len(unprocessed_ids)} IDs to process") new_ids = set(self.source.newer_in(self.target, criteria=self.query, exhaustive=False)) self.logger.info(f"Found {len(new_ids)} updated IDs to process") return list(new_ids | unprocessed_ids) def get_groups_from_keys(self, keys) -> set[tuple]: """ Get the groups by grouping_keys for these documents. """ grouping_keys = self.grouping_keys groups: set[tuple] = set() for chunked_keys in grouper(keys, self.chunk_size): docs = list( self.source.query( criteria={self.source.key: {"$in": chunked_keys}}, properties=grouping_keys, ) ) sub_groups = {tuple(get(d, prop, None) for prop in grouping_keys) for d in docs} self.logger.debug(f"Found {len(sub_groups)} subgroups to process") groups |= sub_groups self.logger.info(f"Found {len(groups)} groups to process") return groups maggma-0.70.0/src/maggma/builders/map_builder.py000066400000000000000000000166251470132070100215220ustar00rootroot00000000000000""" One-to-One Map Builder and a simple CopyBuilder implementation. """ import traceback from abc import ABCMeta, abstractmethod from collections.abc import Iterator from datetime import datetime from math import ceil from time import time from typing import Optional from maggma.core import Builder, Store from maggma.utils import Timeout, grouper class MapBuilder(Builder, metaclass=ABCMeta): """ Apply a unary function to yield a target document for each source document. Supports incremental building, where a source document gets built only if it has newer (by last_updated_field) data than the corresponding (by key) target document. """ def __init__( self, source: Store, target: Store, query: Optional[dict] = None, projection: Optional[list] = None, delete_orphans: bool = False, timeout: int = 0, store_process_time: bool = True, retry_failed: bool = False, **kwargs, ): """ Apply a unary function to each source document. Args: source: source store target: target store query: optional query to filter source store projection: list of keys to project from the source for processing. Limits data transfer to improve efficiency. delete_orphans: Whether to delete documents on target store with key values not present in source store. 
Deletion happens after all updates, during Builder.finalize. timeout: maximum running time per item in seconds store_process_time: If True, add "_process_time" key to document for profiling purposes retry_failed: If True, will retry building documents that previously failed """ self.source = source self.target = target self.query = query self.projection = projection self.delete_orphans = delete_orphans self.kwargs = kwargs self.timeout = timeout self.store_process_time = store_process_time self.retry_failed = retry_failed super().__init__(sources=[source], targets=[target], **kwargs) def ensure_indexes(self): """ Ensures indices on critical fields for MapBuilder. """ index_checks = [ self.source.ensure_index(self.source.key), self.source.ensure_index(self.source.last_updated_field), self.target.ensure_index(self.target.key), self.target.ensure_index(self.target.last_updated_field), self.target.ensure_index("state"), ] if not all(index_checks): self.logger.warning( "Missing one or more important indices on stores. " "Performance for large stores may be severely degraded. " "Ensure indices on target.key and " "[(store.last_updated_field, -1), (store.key, 1)] " "for each of source and target." ) def prechunk(self, number_splits: int) -> Iterator[dict]: """ Generic prechunk for map builder to perform domain-decomposition by the key field. """ self.ensure_indexes() keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True) N = ceil(len(keys) / number_splits) for split in grouper(keys, N): yield {"query": {self.source.key: {"$in": list(split)}}} def get_items(self): """ Generic get items for Map Builder designed to perform incremental building. """ self.logger.info(f"Starting {self.__class__.__name__} Builder") self.ensure_indexes() keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True) if self.retry_failed: if isinstance(self.query, (dict)): failed_query = {"$and": [self.query, {"state": "failed"}]} else: failed_query = {"state": "failed"} failed_keys = self.target.distinct(self.target.key, criteria=failed_query) keys = list(set(keys + failed_keys)) self.logger.info(f"Processing {len(keys)} items") if self.projection: projection = list({*self.projection, self.source.key, self.source.last_updated_field}) else: projection = None self.total = len(keys) for chunked_keys in grouper(keys, self.chunk_size): chunked_keys = list(chunked_keys) yield from list( self.source.query( criteria={self.source.key: {"$in": chunked_keys}}, properties=projection, ) ) def process_item(self, item: dict): """ Generic process items to process a dictionary using a map function. """ self.logger.debug(f"Processing: {item[self.source.key]}") time_start = time() try: with Timeout(seconds=self.timeout): processed = dict(self.unary_function(item)) processed.update({"state": "successful"}) for k in [self.source.key, self.source.last_updated_field]: if k in processed: del processed[k] except Exception as e: self.logger.error(traceback.format_exc()) processed = {"error": str(e), "state": "failed"} time_end = time() key, last_updated_field = self.source.key, self.source.last_updated_field out = { self.target.key: item[key], self.target.last_updated_field: self.source._lu_func[0](item.get(last_updated_field, datetime.utcnow())), } if self.store_process_time: out["_process_time"] = time_end - time_start out.update(processed) return out def update_targets(self, items: list[dict]): """ Generic update targets for Map Builder. 
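        Each document is stamped with a "_bt" (build time) field set to the current
        UTC time, and any "_id" field is removed before the batch is written to the target.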
""" target = self.target for item in items: item["_bt"] = datetime.utcnow() if "_id" in item: del item["_id"] if len(items) > 0: target.update(items) def finalize(self): """ Finalize MapBuilder operations including removing orphaned documents. """ if self.delete_orphans: source_keyvals = set(self.source.distinct(self.source.key)) target_keyvals = set(self.target.distinct(self.target.key)) to_delete = list(target_keyvals - source_keyvals) if len(to_delete): self.logger.info(f"Finalize: Deleting {len(to_delete)} orphans.") self.target.remove_docs({self.target.key: {"$in": to_delete}}) super().finalize() @abstractmethod def unary_function(self, item): """ ufn: Unary function to process item You do not need to provide values for source.key and source.last_updated_field in the output. Any uncaught exceptions will be caught by process_item and logged to the "error" field in the target document. """ class CopyBuilder(MapBuilder): """Sync a source store with a target store.""" def unary_function(self, item): """ Identity function for copy builder map operation. """ if "_id" in item: del item["_id"] return item maggma-0.70.0/src/maggma/builders/projection_builder.py000066400000000000000000000250041470132070100231100ustar00rootroot00000000000000from collections.abc import Iterable from copy import deepcopy from datetime import datetime from itertools import chain from typing import Optional, Union from pydash import get from maggma.core import Builder, Store from maggma.utils import grouper class Projection_Builder(Builder): """ This builder creates new documents that combine information from multiple input stores. These summary documents are then added to the specified target store. Key values are used for matching such that multiple docs from the input stores with the same key value will be combined into a single doc for that key value in the target store. Built in functionalities include user specification of which fields to project into the target store from each input store, renaming projected fields, and limiting the builder to only consider certain key values. """ def __init__( self, source_stores: list[Store], target_store: Store, fields_to_project: Union[list[Union[list, dict]], None] = None, query_by_key: Optional[list] = None, **kwargs, ): """ Args: source_stores ([MongoStore]): List of stores. Fields from these input stores will be projected into target_store target_store (MongoStore): Store where the summary/aggregated output documents produced will be stored fields_to_project ([List,Dict]): If provided, the order of items in this list must correspond to source_stores. By default, all fields of source_stores are projected into target_store. List elements can be provided as 1) a list of strings specifying the fields to pull from each input store, or 2) a dictionary where the values specify the fields to pull from the input store and the keys specify what field that will be used in the target store. e.g. ["field1","field2"] would be equivalent to {"field1":"field1", "field2":"field2"} Or fields could be renamed in the target stores via {"newname1":"field1", "newname2":"field2"} If an empty list or dictionary is provided, all fields of that input store will be projected. Note fields_to_project is converted into the projection_mapping attribute of this builder. There are no checks for possible overwrite errors in output docs for the target_store. query_by_key (List): Provide a list of keys to limit this builder to a only consider a subset of docs with these key values. 
By default, every document from the input stores will be projected. """ # check for user input errors if isinstance(source_stores, list) is False: raise TypeError("Input source_stores must be provided in a list") if isinstance(fields_to_project, list): if len(source_stores) != len(fields_to_project): raise ValueError("There must be an equal number of elements in source_stores and fields_to_project") elif fields_to_project is not None: raise TypeError("Input fields_to_project must be a list. E.g. [['str1','str2'],{'A':'str1','B':str2'}]") # interpret fields_to_project to create projection_mapping attribute projection_mapping: list[dict] # PEP 484 Type Hinting if fields_to_project is None: projection_mapping = [{}] * len(source_stores) else: projection_mapping = [] for f in fields_to_project: if isinstance(f, (list)): projection_mapping.append({i: i for i in f}) elif isinstance(f, (dict)): projection_mapping.append(f) else: raise TypeError( """Input fields_to_project elements must be a list or dict. E.g. [['str1','str2'],{'A':'str1','B':str2'}]""" ) # ensure key is included in projection for get_items query for store, p in zip(source_stores, projection_mapping): if p != {}: p.update({target_store.key: store.key}) self.projection_mapping = projection_mapping # establish other attributes and initialization self.query_by_key = query_by_key or [] self.target = target_store super().__init__(sources=source_stores, targets=target_store, **kwargs) self.ensure_indexes() def ensure_indexes(self): """ Ensures key fields are indexed to improve querying efficiency. """ index_checks = [s.ensure_index(s.key) for s in self.sources] if not all(index_checks): self.logger.warning("Missing indices for key fields on stores.") def get_items(self) -> Iterable: """ Gets items from source_stores for processing. Items are retrieved in chunks based on a subset of key values set by chunk_size but are unsorted. Returns: generator of items to process """ self.logger.info(f"Starting {self.__class__.__name__} get_items...") # get distinct key values if len(self.query_by_key) > 0: keys = self.query_by_key else: unique_keys = set() # type: Set for store in self.sources: store_keys = store.distinct(field=store.key) unique_keys.update(store_keys) if None in store_keys: self.logger.debug( f"None found as a key value for store {store.collection_name} with key {store.key}" ) keys = list(unique_keys) self.logger.info(f"{len(keys)} distinct key values found") self.logger.debug(f"None found in key values? 
{None in keys}") # for every key (in chunks), query from each store and # project fields specified by projection_mapping for chunked_keys in grouper(keys, self.chunk_size): chunked_keys = [k for k in chunked_keys if k is not None] self.logger.debug(f"Querying by chunked_keys: {chunked_keys}") unsorted_items_to_process = [] for store, projection in zip(self.sources, self.projection_mapping): # project all fields from store if corresponding element # in projection_mapping is an empty dict, # else only project the specified fields properties: Union[list, None] if projection == {}: # all fields are projected properties = None self.logger.debug(f"For store {store.collection_name} getting all properties") else: # only specified fields are projected properties = list(projection.values()) self.logger.debug(f"For {store.collection_name} store getting properties: {properties}") # get docs from store for given chunk of key values, # rename fields if specified by projection mapping, # and put in list of unsorted items to be processed docs = store.query(criteria={store.key: {"$in": chunked_keys}}, properties=properties) for d in docs: if properties is None: # all fields are projected as is item = deepcopy(d) else: # specified fields are renamed item = dict() for k, v in projection.items(): item[k] = get(d, v) # remove unneeded fields and add key value to each item # key value stored under target_key is used for sorting # items during the process_items step for k in ["_id", store.last_updated_field]: if k in item: del item[k] item[self.target.key] = d[store.key] unsorted_items_to_process.append(item) self.logger.debug( f"Example fields of one output item from {store.collection_name} store sent to" "process_items: {item.keys()}" ) yield unsorted_items_to_process def process_item(self, items: Union[list, Iterable]) -> list[dict]: """ Takes a chunk of items belonging to a subset of key values and groups them by key value. Combines items for each key value into one single doc for the target store. Arguments: items: items should all belong to a subset of key values but are not in any particular order Returns: items_for_target: a list of items where now each item corresponds to a single key value """ self.logger.info("Processing items: sorting by key values...") key = self.target.key items_sorted_by_key = {} # type: Dict for i in items: key_value = i[key] if key_value not in items_sorted_by_key: items_sorted_by_key[key_value] = [] items_sorted_by_key[key_value].append(i) items_for_target = [] for k, i_sorted in items_sorted_by_key.items(): self.logger.debug(f"Combined items for {key}: {k}") target_doc: dict = {} for i in i_sorted: target_doc.update(i) # last modification is adding key value avoid overwriting target_doc[key] = k items_for_target.append(target_doc) # note target last_updated_field will be added during update_targets() return items_for_target def update_targets(self, items: list): """ Adds a last_updated field to items and then adds them to the target store. 
Arguments: items: a list of items where each item contains all the information from the source_stores corresponding to a single key value """ items = list(filter(None, chain.from_iterable(items))) num_items = len(items) self.logger.info(f"Updating target with {num_items} items...") target = self.target target_insertion_time = datetime.utcnow() for item in items: item[target.last_updated_field] = target_insertion_time if num_items > 0: target.update(items) maggma-0.70.0/src/maggma/cli/000077500000000000000000000000001470132070100156115ustar00rootroot00000000000000maggma-0.70.0/src/maggma/cli/__init__.py000066400000000000000000000146141470132070100177300ustar00rootroot00000000000000#!/usr/bin/env python # coding utf-8 import asyncio import logging import sys from datetime import datetime from itertools import chain import click from monty.serialization import loadfn from maggma.cli.distributed import find_port from maggma.cli.multiprocessing import multi from maggma.cli.serial import serial from maggma.cli.settings import CLISettings from maggma.cli.source_loader import ScriptFinder, load_builder_from_source from maggma.utils import ReportingHandler, TqdmLoggingHandler sys.meta_path.append(ScriptFinder()) settings = CLISettings() @click.command() @click.argument("builders", nargs=-1, type=click.Path(exists=True), required=True) @click.option( "-v", "--verbose", "verbosity", count=True, help="Controls logging level per number of v's", default=0, ) @click.option( "-n", "--num-processes", "num_processes", help="Number of processes to spawn for each worker. Defaults to single processing", default=1, type=click.IntRange(1), ) @click.option( "-r", "--reporting", "reporting_store", help="Store in JSON/YAML form to send reporting data to", type=click.Path(exists=True), ) @click.option("-u", "--url", "url", default=None, type=str, help="URL for the distributed manager") @click.option( "-p", "--port", "port", default=None, type=int, help="Port for distributed communication. mrun will find an open port if None is provided to the manager", ) @click.option( "-N", "--num-chunks", "num_chunks", default=0, type=int, help="Number of chunks to distribute to workers", ) @click.option( "-w", "--num-workers", "num_workers", default=0, type=int, help="Number of distributed workers to process chunks", ) @click.option("--no_bars", is_flag=True, help="Turns of Progress Bars for headless operations") @click.option("--rabbitmq", is_flag=True, help="Enables the use of RabbitMQ as the work broker") @click.option( "-q", "--queue_prefix", "queue_prefix", default="builder", type=str, help="Prefix to use in queue names when RabbitMQ is select as the broker", ) @click.option( "-m", "--memray", "memray", default=False, type=bool, help="Option to profile builder memory usage with Memray", ) @click.option( "-md", "--memray-dir", "memray_dir", default=None, type=str, help="""Directory to dump memory profiler output files. Only runs if --memray is True. Will create directory if directory does not exist, mimicking mkdir -p command. 
If not provided files will be dumped to system's temp directory""", ) @click.pass_context def run( ctx, builders, verbosity, reporting_store, num_workers, url, port, num_chunks, no_bars, num_processes, rabbitmq, queue_prefix, memray, memray_dir, memray_file=None, follow_fork=False, ): # Import profiler and setup directories to dump profiler output if memray: from memray import FileDestination, Tracker if memray_dir: import os os.makedirs(memray_dir, exist_ok=True) memray_file = f"{memray_dir}/{builders[0]}_{datetime.now().isoformat()}.bin" else: memray_file = f"{settings.TEMP_DIR}/{builders[0]}_{datetime.now().isoformat()}.bin" if num_processes > 1: follow_fork = True # Click context manager handles creation and clean up of profiler dump files for memray tracker ctx.obj = ctx.with_resource( Tracker( destination=FileDestination(memray_file), native_traces=False, trace_python_allocators=False, follow_fork=follow_fork, ) ) # Import proper manager and worker if rabbitmq: from maggma.cli.rabbitmq import manager, worker else: from maggma.cli.distributed import manager, worker # Set Logging levels = [logging.WARNING, logging.INFO, logging.DEBUG] level = levels[min(len(levels) - 1, verbosity)] # capped to number of levels root = logging.getLogger() root.setLevel(level) ch = TqdmLoggingHandler() formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) root.addHandler(ch) builder_objects = [] for b in builders: if str(b).endswith(".py") or str(b).endswith(".ipynb"): builder_objects.append(load_builder_from_source(b)) else: builder_objects.append(loadfn(b)) builder_objects = [b if isinstance(b, list) else [b] for b in builder_objects] builder_objects = list(chain.from_iterable(builder_objects)) if reporting_store: reporting_store = loadfn(reporting_store) root.addHandler(ReportingHandler(reporting_store)) if url: if num_chunks > 0: # Manager if port is None: port = find_port() root.critical(f"Using random port for mrun manager: {port}") if rabbitmq: manager( url=url, port=port, builders=builder_objects, num_chunks=num_chunks, num_workers=num_workers, queue_prefix=queue_prefix, ) else: manager( url=url, port=port, builders=builder_objects, num_chunks=num_chunks, num_workers=num_workers, ) else: # Worker if rabbitmq: worker( url=url, port=port, num_processes=num_processes, no_bars=no_bars, queue_prefix=queue_prefix, ) else: worker(url=url, port=port, num_processes=num_processes, no_bars=no_bars) else: if num_processes == 1: for builder in builder_objects: serial(builder, no_bars) else: loop = asyncio.get_event_loop() for builder in builder_objects: loop.run_until_complete(multi(builder=builder, num_processes=num_processes, no_bars=no_bars)) if memray_file: import subprocess subprocess.run(["memray", "flamegraph", memray_file], shell=False, check=False) maggma-0.70.0/src/maggma/cli/distributed.py000066400000000000000000000247331470132070100205160ustar00rootroot00000000000000#!/usr/bin/env python # coding utf-8 import asyncio import json import socket as pysocket from logging import getLogger from random import randint from time import perf_counter import numpy as np import zmq from monty.json import jsanitize from monty.serialization import MontyDecoder from maggma.cli.multiprocessing import multi from maggma.cli.settings import CLISettings from maggma.core import Builder from maggma.utils import tqdm settings = CLISettings() def find_port(): sock = pysocket.socket() sock.bind(("", 0)) return sock.getsockname()[1] def manager(url: str, port: int, 
builders: list[Builder], num_chunks: int, num_workers: int): """ Really simple manager for distributed processing that uses a builder prechunk to modify the builder and send out modified builders for each worker to run. The manager will try and keep track of workers, including which error out and which complete. Currently, if a single workers fails the entire distributed job will be stopped. """ logger = getLogger("Manager") if not (num_chunks and num_workers): raise ValueError("Both num_chunks and num_workers must be non-zero") logger.info(f"Binding to Manager URL {url}:{port}") # Setup socket and polling socket, poll = setup(url, port) workers = {} # type: ignore logger.debug("Manager started and looking for workers") for builder in builders: logger.info(f"Working on {builder.__class__.__name__}") builder_dict = builder.as_dict() try: builder.connect() chunk_dicts = [{"chunk": d, "distributed": False, "completed": False} for d in builder.prechunk(num_chunks)] pbar_distributed = tqdm( total=len(chunk_dicts), desc=f"Distributed chunks for {builder.__class__.__name__}", ) pbar_completed = tqdm( total=len(chunk_dicts), desc=f"Completed chunks for {builder.__class__.__name__}", ) logger.info(f"Distributing {len(chunk_dicts)} chunks to workers") except NotImplementedError: attempt_graceful_shutdown(workers, socket) raise RuntimeError(f"Can't distribute process {builder.__class__.__name__} as no prechunk method exists.") completed = False while not completed: completed = all(chunk["completed"] for chunk in chunk_dicts) if num_workers <= 0: socket.close() raise RuntimeError("No workers to distribute chunks to") # Poll and look for messages from workers connections = dict(poll.poll(100)) # If workers send messages decode and figure out what do if connections: identity, _, bmsg = socket.recv_multipart() msg = bmsg.decode("utf-8") if "READY" in msg: if identity not in workers: logger.debug(f"Got connection from worker: {msg.split('_')[1]}") workers[identity] = { "working": False, "heartbeats": 1, "last_ping": perf_counter(), "work_index": -1, } else: workers[identity]["working"] = False work_ind = workers[identity]["work_index"] if work_ind != -1: chunk_dicts[work_ind]["completed"] = True # type: ignore pbar_completed.update(1) # If everything is distributed, send EXIT to the worker if all(chunk["distributed"] for chunk in chunk_dicts): logger.debug(f"Sending exit signal to worker: {msg.split('_')[1]}") socket.send_multipart([identity, b"", b"EXIT"]) workers.pop(identity) elif "ERROR" in msg: # Remove worker and requeue work sent to it attempt_graceful_shutdown(workers, socket) raise RuntimeError( "At least one worker has stopped with error message: {}".format(msg.split("_")[1]) ) elif msg == "PING": # Respond to heartbeat socket.send_multipart([identity, b"", b"PONG"]) workers[identity]["last_ping"] = perf_counter() workers[identity]["heartbeats"] += 1 # Decide if any workers are dead and need to be removed if settings.WORKER_TIMEOUT is not None: handle_dead_workers(workers, socket) for work_index, chunk_dict in enumerate(chunk_dicts): if not chunk_dict["distributed"]: temp_builder_dict = dict(**builder_dict) temp_builder_dict.update(chunk_dict["chunk"]) # type: ignore temp_builder_dict = jsanitize(temp_builder_dict, recursive_msonable=True) # Send work for available workers for identity in workers: if not workers[identity]["working"]: # Send out a chunk to idle worker socket.send_multipart( [ identity, b"", json.dumps(temp_builder_dict).encode("utf-8"), ] ) workers[identity]["work_index"] = 
work_index workers[identity]["working"] = True chunk_dicts[work_index]["distributed"] = True pbar_distributed.update(1) # Send EXIT to any remaining workers logger.info("Sending exit messages to workers once they are done") attempt_graceful_shutdown(workers, socket) def setup(url, port): context = zmq.Context() context.setsockopt(opt=zmq.SocketOption.ROUTER_MANDATORY, value=1) context.setsockopt(opt=zmq.SNDHWM, value=0) context.setsockopt(opt=zmq.RCVHWM, value=0) socket = context.socket(zmq.ROUTER) socket.bind(f"{url}:{port}") poll = zmq.Poller() poll.register(socket, zmq.POLLIN) return socket, poll def attempt_graceful_shutdown(workers, socket): for identity in workers: socket.send_multipart([identity, b"", b"EXIT"]) socket.close() def handle_dead_workers(workers, socket): if len(workers) == 1: # Use global timeout identity = next(iter(workers.keys())) if (perf_counter() - workers[identity]["last_ping"]) >= settings.WORKER_TIMEOUT: attempt_graceful_shutdown(workers, socket) raise RuntimeError("Worker has timed out. Stopping distributed build.") elif len(workers) == 2: # Use 10% ratio between workers workers_sorted = sorted(workers.items(), key=lambda x: x[1]["heartbeats"]) ratio = workers_sorted[1][1]["heartbeats"] / workers_sorted[0][1]["heartbeats"] if ratio <= 0.1: attempt_graceful_shutdown(workers, socket) raise RuntimeError("One worker has timed out. Stopping distributed build.") elif len(workers) > 2: # Calculate modified z-score of heartbeat counts and see if any are <= -3.5 hearbeat_vals = [w["heartbeats"] for w in workers.values()] median = np.median(hearbeat_vals) mad = np.median([abs(i - median) for i in hearbeat_vals]) if mad > 0: for identity in list(workers.keys()): z_score = 0.6745 * (workers[identity]["heartbeats"] - median) / mad if z_score <= -3.5: attempt_graceful_shutdown(workers, socket) raise RuntimeError("At least one worker has timed out. Stopping distributed build.") def worker(url: str, port: int, num_processes: int, no_bars: bool): """ Simple distributed worker that connects to a manager asks for work and deploys using multiprocessing. 
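    The worker announces itself with a "READY_<hostname>" message, receives
    JSON-serialized builder chunks from the manager, runs each chunk via the
    multiprocessing helper (sending "PING" heartbeats while working), and shuts
    down once the manager sends "EXIT".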
""" identity = f"{randint(0, 0x10000):04X}-{randint(0, 0x10000):04X}" logger = getLogger(f"Worker {identity}") logger.info(f"Connecting to Manager at {url}:{port}") context = zmq.Context() socket: zmq.Socket = context.socket(zmq.REQ) socket.setsockopt_string(zmq.IDENTITY, identity) socket.connect(f"{url}:{port}") poller = zmq.Poller() poller.register(socket, zmq.POLLIN) # Initial message package hostname = pysocket.gethostname() try: running = True while running: socket.send(f"READY_{hostname}".encode()) # Poll for MANAGER_TIMEOUT seconds, if nothing is given then assume manager is dead and timeout connections = dict(poller.poll(settings.MANAGER_TIMEOUT * 1000)) if not connections: socket.close() raise RuntimeError("Stopping work as manager timed out.") bmessage: bytes = socket.recv() message = bmessage.decode("utf-8") if "@class" in message and "@module" in message: # We have a valid builder work = json.loads(message) builder = MontyDecoder().process_decoded(work) asyncio.run( multi( builder, num_processes, no_bars=no_bars, heartbeat_func=ping_manager, heartbeat_func_kwargs={"socket": socket, "poller": poller}, ) ) elif message == "EXIT": # End the worker running = False except Exception as e: logger.error(f"A worker failed with error: {e}") socket.send(f"ERROR_{e}".encode()) socket.close() socket.close() def ping_manager(socket, poller): socket.send_string("PING") # Poll for MANAGER_TIMEOUT seconds, if nothing is given then assume manager is dead and timeout connections = dict(poller.poll(settings.MANAGER_TIMEOUT * 1000)) if not connections: socket.close() raise RuntimeError("Stopping work as manager timed out.") message: bytes = socket.recv() if message.decode("utf-8") != "PONG": socket.close() raise RuntimeError("Stopping work as manager did not respond to heartbeat from worker.") maggma-0.70.0/src/maggma/cli/multiprocessing.py000066400000000000000000000152041470132070100214140ustar00rootroot00000000000000#!/usr/bin/env python # coding utf-8 from asyncio import BoundedSemaphore, Queue, gather, get_event_loop from concurrent.futures import ProcessPoolExecutor from logging import getLogger from types import GeneratorType from typing import Any, Callable, Optional from aioitertools import enumerate from tqdm.auto import tqdm from maggma.utils import primed logger = getLogger("MultiProcessor") class BackPressure: """ Wrapper for an iterator to provide async access with backpressure. """ def __init__(self, iterator, n): self.iterator = iter(iterator) self.back_pressure = BoundedSemaphore(n) def __aiter__(self): return self async def __anext__(self): await self.back_pressure.acquire() try: return next(self.iterator) except StopIteration: raise StopAsyncIteration async def release(self, async_iterator): """ release iterator to pipeline the backpressure. """ async for item in async_iterator: try: self.back_pressure.release() except ValueError: pass yield item class AsyncUnorderedMap: """ Async iterator that maps a function to an async iterator using an executor and returns items as they are done This does not guarantee order. 
""" def __init__(self, func, async_iterator, executor): self.iterator = async_iterator self.func = func self.executor = executor loop = get_event_loop() self.fill_task = loop.create_task(self.get_from_iterator()) self.done_sentinel = object() self.results = Queue() self.tasks = {} async def process_and_release(self, idx): future = self.tasks[idx] try: item = await future self.results.put_nowait(item) except Exception: pass finally: self.tasks.pop(idx) async def get_from_iterator(self): loop = get_event_loop() async for idx, item in enumerate(self.iterator): future = loop.run_in_executor(self.executor, safe_dispatch, (self.func, item)) self.tasks[idx] = future # TODO - line below raises RUF006 error. Unsure about the best way to # resolve. See https://docs.astral.sh/ruff/rules/asyncio-dangling-task/ loop.create_task(self.process_and_release(idx)) # noqa: RUF006 await gather(*self.tasks.values()) self.results.put_nowait(self.done_sentinel) def __aiter__(self): return self async def __anext__(self): item = await self.results.get() if item == self.done_sentinel: raise StopAsyncIteration return item async def atqdm(async_iterator, *args, **kwargs): """ Wrapper around tqdm for async generators. """ _tqdm = tqdm(*args, **kwargs) async for item in async_iterator: _tqdm.update() yield item _tqdm.close() async def grouper(async_iterator, n: int): """ Collect data into fixed-length chunks or blocks. >>> list(grouper(3, 'ABCDEFG')) [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]. Updated from: https://stackoverflow.com/questions/31164731/python-chunking-csv-file-multiproccessing/31170795#31170795 Modified for async """ chunk = [] async for item in async_iterator: chunk.append(item) if len(chunk) >= n: yield chunk chunk.clear() if chunk != []: yield chunk def safe_dispatch(val): func, item = val try: return func(item) except Exception as e: logger.error(e) return None async def multi( builder, num_processes, no_bars=False, heartbeat_func: Optional[Callable[..., Any]] = None, heartbeat_func_kwargs: Optional[dict[Any, Any]] = None, ): builder.connect() cursor = builder.get_items() executor = ProcessPoolExecutor(num_processes) # Gets the total number of items to process by priming # the cursor total = None if isinstance(cursor, GeneratorType): try: cursor = primed(cursor) if hasattr(builder, "total"): total = builder.total except StopIteration: pass elif hasattr(cursor, "__len__"): total = len(cursor) elif hasattr(cursor, "count"): total = cursor.count() logger.info( f"Starting multiprocessing: {builder.__class__.__name__}", extra={ "maggma": { "event": "BUILD_STARTED", "total": total, "builder": builder.__class__.__name__, "sources": [source.name for source in builder.sources], "targets": [target.name for target in builder.targets], } }, ) back_pressured_get = BackPressure( iterator=tqdm(cursor, desc="Get", total=total, disable=no_bars), n=builder.chunk_size, ) processed_items = atqdm( async_iterator=AsyncUnorderedMap( func=builder.process_item, async_iterator=back_pressured_get, executor=executor, ), total=total, desc="Process Items", disable=no_bars, ) if not heartbeat_func_kwargs: heartbeat_func_kwargs = {} if heartbeat_func: heartbeat_func(**heartbeat_func_kwargs) back_pressure_relief = back_pressured_get.release(processed_items) update_items = tqdm(total=total, desc="Update Targets", disable=no_bars) async for chunk in grouper(back_pressure_relief, n=builder.chunk_size): logger.info( f"Processed batch of {builder.chunk_size} items", extra={ "maggma": { "event": "UPDATE", "items": len(chunk), "builder": 
builder.__class__.__name__, "sources": [source.name for source in builder.sources], "targets": [target.name for target in builder.targets], } }, ) processed_items = [item for item in chunk if item is not None] builder.update_targets(processed_items) update_items.update(len(processed_items)) logger.info( f"Ended multiprocessing: {builder.__class__.__name__}", extra={ "maggma": { "event": "BUILD_ENDED", "builder": builder.__class__.__name__, "sources": [source.name for source in builder.sources], "targets": [target.name for target in builder.targets], } }, ) update_items.close() builder.finalize() maggma-0.70.0/src/maggma/cli/rabbitmq.py000066400000000000000000000263221470132070100177710ustar00rootroot00000000000000#!/usr/bin/env python # coding utf-8 import asyncio import json import socket as pysocket from logging import getLogger from random import randint from time import perf_counter from typing import Literal import numpy as np from monty.json import jsanitize from monty.serialization import MontyDecoder from maggma.cli.multiprocessing import multi from maggma.cli.settings import CLISettings from maggma.core import Builder from maggma.utils import Timeout, tqdm try: import pika except ImportError: raise ImportError("Both pika and aio-pika are required to use RabbitMQ as a broker") settings = CLISettings() def find_port(): sock = pysocket.socket() sock.bind(("", 0)) return sock.getsockname()[1] def manager( url: str, builders: list[Builder], num_chunks: int, num_workers: int, queue_prefix: str, port: int = 5672, ): """ Rabbit MQ manager for distributed processing that uses a builder prechunk to modify the builder and send them out each worker to run. """ logger = getLogger("Manager") if not (num_chunks and num_workers): raise ValueError("Both num_chunks and num_workers must be non-zero") url = url.split("//")[-1] logger.info(f"Binding to Manager URL {url}:{port}") # Setup connection to RabbitMQ and ensure on all queues is one unit connection, channel, status_queue, worker_queue = setup_rabbitmq(url, queue_prefix, port, "work") workers = {} # type: ignore logger.debug("Manager started and looking for workers") for builder in builders: logger.info(f"Working on {builder.__class__.__name__}") builder_dict = builder.as_dict() try: builder.connect() chunk_dicts = [{"chunk": d, "distributed": False, "completed": False} for d in builder.prechunk(num_chunks)] pbar_distributed = tqdm( total=len(chunk_dicts), desc=f"Distributed chunks for {builder.__class__.__name__}", ) pbar_completed = tqdm( total=len(chunk_dicts), desc=f"Completed chunks for {builder.__class__.__name__}", ) logger.info(f"Distributing {len(chunk_dicts)} chunks to workers") except NotImplementedError: attempt_graceful_shutdown(connection, workers, channel, worker_queue) raise RuntimeError(f"Can't distribute process {builder.__class__.__name__} as no prechunk method exists.") completed = False while not completed: completed = all(chunk["completed"] for chunk in chunk_dicts) if num_workers <= 0: connection.close() raise RuntimeError("No workers to distribute chunks to") # If workers send messages decode and figure out what do _, _, body = channel.basic_get(queue=status_queue, auto_ack=True) if body is not None: msg = body.decode("utf-8") identity = msg.split("_")[-1] if "READY" in msg: if identity not in workers: logger.debug(f"Got connection from worker: {msg.split('_')[1]}") workers[identity] = { "working": False, "heartbeats": 1, "last_ping": perf_counter(), "work_index": -1, } elif "DONE" in msg: workers[identity]["working"] = 
False work_ind = workers[identity]["work_index"] if work_ind != -1: chunk_dicts[work_ind]["completed"] = True # type: ignore pbar_completed.update(1) elif "ERROR" in msg: # Remove worker and requeue work sent to it attempt_graceful_shutdown(connection, workers, channel, worker_queue) raise RuntimeError( "At least one worker has stopped with error message: {}".format(msg.split("_")[1]) ) elif "PING" in msg: # Heartbeat from worker (no pong response) workers[identity]["last_ping"] = perf_counter() workers[identity]["heartbeats"] += 1 # Decide if any workers are dead and need to be removed handle_dead_workers(connection, workers, channel, worker_queue) for work_index, chunk_dict in enumerate(chunk_dicts): if not chunk_dict["distributed"]: temp_builder_dict = dict(**builder_dict) temp_builder_dict.update(chunk_dict["chunk"]) # type: ignore temp_builder_dict = jsanitize(temp_builder_dict, recursive_msonable=True) # Send work for available workers for identity in workers: if not workers[identity]["working"]: # Send out a chunk to idle worker channel.basic_publish( exchange="", routing_key=worker_queue, body=json.dumps(temp_builder_dict).encode("utf-8"), ) workers[identity]["work_index"] = work_index workers[identity]["working"] = True chunk_dicts[work_index]["distributed"] = True pbar_distributed.update(1) # Send EXIT to any remaining workers logger.info("Sending exit messages to workers once they are done") attempt_graceful_shutdown(connection, workers, channel, worker_queue) def setup_rabbitmq(url: str, queue_prefix: str, port: int, outbound_queue: Literal["status", "work"]): connection = pika.BlockingConnection(pika.ConnectionParameters(url, port)) channel = connection.channel() channel.basic_qos(prefetch_count=1, global_qos=True) # Ensure both worker status and work distribution queues exist status_queue = queue_prefix + "_status" worker_queue = queue_prefix + "_work" channel.queue_declare(queue=status_queue, auto_delete=True) channel.queue_declare(queue=worker_queue, auto_delete=True) # Clear out outbound queue if outbound_queue == "work": channel.queue_purge(queue=worker_queue) else: channel.queue_purge(queue=status_queue) return connection, channel, status_queue, worker_queue def attempt_graceful_shutdown(connection, workers, channel, worker_queue): for _ in workers: channel.basic_publish( exchange="", routing_key=worker_queue, body=b"EXIT", ) connection.close() def handle_dead_workers(connection, workers, channel, worker_queue): if len(workers) == 1: # Use global timeout identity = next(iter(workers.keys())) if (perf_counter() - workers[identity]["last_ping"]) >= settings.WORKER_TIMEOUT: attempt_graceful_shutdown(connection, workers, channel, worker_queue) raise RuntimeError("Worker has timed out. Stopping distributed build.") elif len(workers) == 2: # Use 10% ratio between workers workers_sorted = sorted(workers.items(), key=lambda x: x[1]["heartbeats"]) ratio = workers_sorted[1][1]["heartbeats"] / workers_sorted[0][1]["heartbeats"] if ratio <= 0.1: attempt_graceful_shutdown(connection, workers, channel, worker_queue) raise RuntimeError("One worker has timed out. 
Stopping distributed build.") elif len(workers) > 2: # Calculate modified z-score of heartbeat counts and see if any are <= -3.5 hearbeat_vals = [w["heartbeats"] for w in workers.values()] median = np.median(hearbeat_vals) mad = np.median([abs(i - median) for i in hearbeat_vals]) if mad > 0: for identity in list(workers.keys()): z_score = 0.6745 * (workers[identity]["heartbeats"] - median) / mad if z_score <= -3.5: attempt_graceful_shutdown(connection, workers, channel, worker_queue) raise RuntimeError("At least one worker has timed out. Stopping distributed build.") def worker(url: str, port: int, num_processes: int, no_bars: bool, queue_prefix: str): """ Simple distributed worker that connects to a manager asks for work and deploys using multiprocessing. """ identity = f"{randint(0, 0x10000):04X}-{randint(0, 0x10000):04X}" logger = getLogger(f"Worker {identity}") url = url.split("//")[-1] logger.info(f"Connecting to Manager at {url}:{port}") # Setup connection to RabbitMQ and ensure on all queues is one unit connection, channel, status_queue, worker_queue = setup_rabbitmq(url, queue_prefix, port, "status") # Send ready signal to status queue channel.basic_publish( exchange="", routing_key=status_queue, body=f"READY_{identity}".encode(), ) try: running = True while running: # Wait for work from manager with Timeout(seconds=settings.MANAGER_TIMEOUT): _, _, body = channel.basic_get(queue=worker_queue, auto_ack=True) if body is not None: message = body.decode("utf-8") if "@class" in message and "@module" in message: # We have a valid builder work = json.loads(message) builder = MontyDecoder().process_decoded(work) logger.info(f"Working on builder {builder.__class__}") channel.basic_publish( exchange="", routing_key=status_queue, body=f"WORKING_{identity}".encode(), ) work = json.loads(message) builder = MontyDecoder().process_decoded(work) asyncio.run( multi( builder, num_processes, no_bars=no_bars, heartbeat_func=ping_manager, heartbeat_func_kwargs={ "channel": channel, "identity": identity, "status_queue": status_queue, }, ) ) channel.basic_publish( exchange="", routing_key=status_queue, body=f"DONE_{identity}".encode(), ) elif message == "EXIT": # End the worker running = False except Exception as e: logger.error(f"A worker failed with error: {e!r}") channel.basic_publish( exchange="", routing_key=status_queue, body=f"ERROR_{identity}".encode(), ) connection.close() connection.close() def ping_manager(channel, identity, status_queue): channel.basic_publish( exchange="", routing_key=status_queue, body=f"PING_{identity}".encode(), ) maggma-0.70.0/src/maggma/cli/serial.py000066400000000000000000000037741470132070100174550ustar00rootroot00000000000000#!/usr/bin/env python # coding utf-8 import logging from types import GeneratorType from tqdm.auto import tqdm from maggma.core import Builder from maggma.utils import grouper, primed def serial(builder: Builder, no_bars=False): """ Runs the builders using a single process. 
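    Items are pulled from builder.get_items(), processed one at a time with
    process_item, and written back in chunks of builder.chunk_size via
    update_targets; builder.finalize() is called once the cursor is exhausted.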
""" logger = logging.getLogger("SerialProcessor") builder.connect() cursor = builder.get_items() total = None if isinstance(cursor, GeneratorType): try: cursor = primed(cursor) if hasattr(builder, "total"): total = builder.total except StopIteration: pass elif hasattr(cursor, "__len__"): total = len(cursor) # type: ignore elif hasattr(cursor, "count"): total = cursor.count() # type: ignore logger.info( f"Starting serial processing: {builder.__class__.__name__}", extra={ "maggma": { "event": "BUILD_STARTED", "total": total, "builder": builder.__class__.__name__, "sources": [source.name for source in builder.sources], "targets": [target.name for target in builder.targets], } }, ) for chunk in grouper(tqdm(cursor, total=total, disable=no_bars), builder.chunk_size): logger.info( f"Processing batch of {builder.chunk_size} items", extra={ "maggma": { "event": "UPDATE", "items": len(chunk), "builder": builder.__class__.__name__, } }, ) processed_chunk = [builder.process_item(item) for item in chunk] processed_items = [item for item in processed_chunk if item is not None] builder.update_targets(processed_items) logger.info( f"Ended serial processing: {builder.__class__.__name__}", extra={"maggma": {"event": "BUILD_ENDED", "builder": builder.__class__.__name__}}, ) builder.finalize() maggma-0.70.0/src/maggma/cli/settings.py000066400000000000000000000012721470132070100200250ustar00rootroot00000000000000import platform import tempfile from typing import Optional from pydantic import Field from pydantic_settings import BaseSettings tempdir = "/tmp" if platform.system() == "Darwin" else tempfile.gettempdir() class CLISettings(BaseSettings): WORKER_TIMEOUT: Optional[int] = Field( None, description="Timeout in seconds for a distributed worker", ) MANAGER_TIMEOUT: int = Field( 3600, description="Timeout in seconds for the worker manager", ) TEMP_DIR: str = Field( tempdir, description="Directory that memory profile .bin files are dumped to", ) class Config: env_prefix = "MAGGMA_" extra = "ignore" maggma-0.70.0/src/maggma/cli/source_loader.py000066400000000000000000000125351470132070100210170ustar00rootroot00000000000000import importlib.util import sys from glob import glob from importlib.abc import Loader, MetaPathFinder from importlib.machinery import ModuleSpec, SourceFileLoader from pathlib import Path from maggma.core import Builder try: import nbformat from IPython import get_ipython from IPython.core.interactiveshell import InteractiveShell from regex import match except ModuleNotFoundError: pass _BASENAME = "maggma.cli.sources" class ScriptFinder(MetaPathFinder): """ Special Finder designed to find custom script builders. 
""" @classmethod def find_spec(cls, fullname, path, target=None): if not (str(fullname).startswith(f"{_BASENAME}.")): return None # The last module is what we want to find the path for sub_path = str(fullname).split(".")[-1] segments = sub_path.split("_") file_path = next(find_matching_file(segments)) if file_path is None: return None return spec_from_source(file_path) class NotebookLoader(Loader): """Module Loader for Jupyter Notebooks or Source Files.""" def __init__(self, name=None, path=None): self.shell = InteractiveShell.instance() self.name = name self.path = path def create_module(self, spec): return None def exec_module(self, module): module.__dict__["get_ipython"] = get_ipython module.__path__ = self.path # load the notebook object with open(self.path, encoding="utf-8") as f: nb = nbformat.read(f, 4) # extra work to ensure that magics that would affect the user_ns # actually affect the notebook module's ns save_user_ns = self.shell.user_ns self.shell.user_ns = module.__dict__ try: for cell in nb.cells: if cell.cell_type == "code": # transform the input to executable Python code = self.shell.input_transformer_manager.transform_cell(cell.source) # run the code in themodule exec(code, module.__dict__) finally: self.shell.user_ns = save_user_ns return module def spec_from_source(file_path: str) -> ModuleSpec: """ Returns a ModuleSpec from a filepath for importlib loading Specialized for loading python source files and notebooks into a temporary maggma cli package to run as a builder. """ file_path_obj = Path(file_path).resolve().relative_to(Path(".").resolve()) file_path_str = str(file_path_obj) if file_path_obj.parts[-1][-3:] == ".py": # Gets module name from the filename without the .py extension module_name = "_".join(file_path_obj.parts).replace(" ", "_").replace(".py", "") spec = ModuleSpec( name=f"{_BASENAME}.{module_name}", loader=SourceFileLoader(fullname=f"{_BASENAME}.{module_name}", path=file_path_str), origin=file_path_str, ) # spec._set_fileattr = True elif file_path_obj.parts[-1][-6:] == ".ipynb": # Gets module name from the filename without the .ipnb extension module_name = "_".join(file_path_obj.parts).replace(" ", "_").replace(".ipynb", "") spec = ModuleSpec( name=f"{_BASENAME}.{module_name}", loader=NotebookLoader(name=f"{_BASENAME}.{module_name}", path=file_path_str), origin=file_path_str, ) # spec._set_fileattr = True else: raise Exception("Can't load {file_path}. Must provide a python source file such as a .py or .ipynb file") return spec def load_builder_from_source(file_path: str) -> list[Builder]: """ Loads Maggma Builders from a Python source file. """ file_path = str(Path(file_path).resolve()) spec = spec_from_source(file_path) module_object = importlib.util.module_from_spec(spec) spec.loader.exec_module(module_object) # type: ignore sys.modules[spec.name] = module_object if hasattr(module_object, "__builders__"): return module_object.__builders__ if hasattr(module_object, "__builder__"): return module_object.__builder__ raise Exception(f"No __builders__ or __builder__ attribute found in {file_path}") def find_matching_file(segments, curr_path="./"): """ Finds file that has the right sequence of segments in the path relative to the current path Requires all segments match the file path. 
""" # If we've gotten to the end of the segment match check to see if a file exists if len(segments) == 0: if Path(curr_path + ".py").exists(): yield curr_path + ".py" if Path(curr_path + ".ipynb").exists(): yield curr_path + ".ipynb" else: # Recurse down the segment tree some more current_segment = segments[0] remainder = segments[1:] re = rf"({curr_path}[\s_]*{current_segment})" pos_matches = [match(re, pos_path) for pos_path in glob(curr_path + "*")] pos_matches = {pmatch.group(1) for pmatch in pos_matches if pmatch} for new_path in pos_matches: if Path(new_path).exists() and Path(new_path).is_dir: for sub_match in find_matching_file(remainder, curr_path=new_path + "/"): yield sub_match for sub_match in find_matching_file(remainder, curr_path=new_path): yield sub_match maggma-0.70.0/src/maggma/cli/sources/000077500000000000000000000000001470132070100172745ustar00rootroot00000000000000maggma-0.70.0/src/maggma/cli/sources/__init__.py000066400000000000000000000000761470132070100214100ustar00rootroot00000000000000"""Dummy module to allow for loading dynamic source files.""" maggma-0.70.0/src/maggma/core/000077500000000000000000000000001470132070100157725ustar00rootroot00000000000000maggma-0.70.0/src/maggma/core/__init__.py000066400000000000000000000004261470132070100201050ustar00rootroot00000000000000"""Core specifications for Maggma.""" from maggma.core.builder import Builder from maggma.core.store import DateTimeFormat, Sort, Store, StoreError from maggma.core.validator import Validator __all__ = ["Builder", "DateTimeFormat", "Sort", "Store", "StoreError", "Validator"] maggma-0.70.0/src/maggma/core/builder.py000066400000000000000000000111541470132070100177740ustar00rootroot00000000000000""" Module containing the core builder definition. """ import logging from abc import ABCMeta, abstractmethod from collections.abc import Iterable from typing import Any, Union from monty.json import MontyDecoder, MSONable from maggma.core.store import Store, StoreError from maggma.utils import TqdmLoggingHandler, grouper, tqdm class Builder(MSONable, metaclass=ABCMeta): """ Base Builder class At minimum this class should implement: get_items - Get items from the sources update_targets - Updates the sources with results. Multiprocessing and MPI processing can be used if all the data processing is limited to process_items """ def __init__( self, sources: Union[list[Store], Store], targets: Union[list[Store], Store], chunk_size: int = 1000, ): """ Initialize the builder the framework. Arguments: sources: source Store(s) targets: target Store(s) chunk_size: chunk size for processing """ self.sources = sources if isinstance(sources, list) else [sources] self.targets = targets if isinstance(targets, list) else [targets] self.chunk_size = chunk_size self.total = None # type: Optional[int] self.logger = logging.getLogger(type(self).__name__) self.logger.addHandler(logging.NullHandler()) def connect(self): """ Connect to the builder sources and targets. """ for s in self.sources + self.targets: s.connect() def prechunk(self, number_splits: int) -> Iterable[dict]: """ Part of a domain-decomposition paradigm to allow the builder to operate on multiple nodes by dividing up the IO as well as the compute This function should return an iterator of dictionaries that can be distributed to multiple instances of the builder to get/process/update on. 
Arguments: number_splits: The number of groups to split the documents to work on """ self.logger.info( f"{self.__class__.__name__} doesn't have distributed processing capabilities." " Instead this builder will run on just one worker for all processing" ) raise NotImplementedError( f"{self.__class__.__name__} doesn't have distributed processing capabilities." " Instead this builder will run on just one worker for all processing" ) @abstractmethod def get_items(self) -> Iterable: """ Returns all the items to process. Returns: generator or list of items to process """ def process_item(self, item: Any) -> Any: """ Process an item. There should be no database operations in this method. Default behavior is to return the item. Arguments: item: Returns: item: an item to update """ return item @abstractmethod def update_targets(self, items: list): """ Takes a list of items from process item and updates the targets with them. Can also perform other book keeping in the process such as storing gridfs oids, etc. Arguments: items: Returns: """ def finalize(self): """ Perform any final clean up. """ # Close any Mongo connections. for store in self.sources + self.targets: try: store.close() except (AttributeError, StoreError): continue def run(self, log_level=logging.DEBUG): """ Run the builder serially This is only intended for diagnostic purposes. """ # Set up logging root = logging.getLogger() root.setLevel(log_level) ch = TqdmLoggingHandler() formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) root.addHandler(ch) self.connect() cursor = self.get_items() for chunk in grouper(tqdm(cursor), self.chunk_size): self.logger.info(f"Processing batch of {self.chunk_size} items") processed_chunk = [self.process_item(item) for item in chunk] processed_items = [item for item in processed_chunk if item is not None] self.update_targets(processed_items) self.finalize() def __getstate__(self): return self.as_dict() def __setstate__(self, d): d = {k: v for k, v in d.items() if not k.startswith("@")} d = MontyDecoder().process_decoded(d) self.__init__(**d) maggma-0.70.0/src/maggma/core/store.py000066400000000000000000000256401470132070100175070ustar00rootroot00000000000000""" Module containing the core Store definition. """ import logging from abc import ABCMeta, abstractmethod, abstractproperty from collections.abc import Iterator from datetime import datetime from enum import Enum from typing import Callable, Optional, Union from monty.dev import deprecated from monty.json import MontyDecoder, MSONable from pydash import get, has, identity from maggma.core.validator import Validator from maggma.utils import LU_KEY_ISOFORMAT class Sort(Enum): """Enumeration for sorting order.""" Ascending = 1 Descending = -1 class DateTimeFormat(Enum): """Datetime format in store document.""" DateTime = "datetime" IsoFormat = "isoformat" class Store(MSONable, metaclass=ABCMeta): """ Abstract class for a data Store Defines the interface for all data going in and out of a Builder. """ def __init__( self, key: str = "task_id", last_updated_field: str = "last_updated", last_updated_type: DateTimeFormat = DateTimeFormat("datetime"), # noqa: B008 validator: Optional[Validator] = None, ): """ Args: key: main key to index on last_updated_field: field for date/time stamping the data last_updated_type: the date/time format for the last_updated_field. Can be "datetime" or "isoformat" validator: Validator to validate documents going into the store. 
""" self.key = key self.last_updated_field = last_updated_field self.last_updated_type = last_updated_type self._lu_func: tuple[Callable, Callable] = ( LU_KEY_ISOFORMAT if DateTimeFormat(last_updated_type) == DateTimeFormat.IsoFormat else (identity, identity) ) self.validator = validator self.logger = logging.getLogger(type(self).__name__) self.logger.addHandler(logging.NullHandler()) @abstractproperty def _collection(self): """ Returns a handle to the pymongo collection object. """ @abstractproperty def name(self) -> str: """ Return a string representing this data source. """ @abstractmethod def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not """ @abstractmethod def close(self): """ Closes any connections. """ @abstractmethod def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ @abstractmethod def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned """ @abstractmethod def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ @abstractmethod def ensure_index(self, key: str, unique: bool = False) -> bool: """ Tries to create an index and return true if it succeeded. Args: key: single key to index unique: Whether or not this index contains only unique keys Returns: bool indicating if the index exists/was created """ @abstractmethod def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ @abstractmethod def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ def query_one( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, ): """ Queries the Store for a single document. Args: criteria: PyMongo filter for documents to search properties: properties to return in the document sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
""" return next(self.query(criteria=criteria, properties=properties, sort=sort), None) def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ criteria = criteria or {} results = [key for key, _ in self.groupby(field, properties=[field], criteria=criteria)] return [get(r, field) for r in results] @property def last_updated(self) -> datetime: """ Provides the most recent last_updated date time stamp from the documents in this Store. """ doc = next( self.query( properties=[self.last_updated_field], sort={self.last_updated_field: -1}, limit=1, ), None, ) if doc and not has(doc, self.last_updated_field): raise StoreError( f"No field '{self.last_updated_field}' in store document. Please ensure Store.last_updated_field " "is a datetime field in your store that represents the time of " "last update to each document." ) if not doc or get(doc, self.last_updated_field) is None: # Handle when collection has docs but `NoneType` last_updated_field. return datetime.min return self._lu_func[0](get(doc, self.last_updated_field)) def newer_in(self, target: "Store", criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]: """ Returns the keys of documents that are newer in the target Store than this Store. Args: target: target Store to criteria: PyMongo filter for documents to search in exhaustive: triggers an item-by-item check vs. checking the last_updated of the target Store and using that to filter out new items in """ self.ensure_index(self.key) self.ensure_index(self.last_updated_field) if exhaustive: # Get our current last_updated dates for each key value props = {self.key: 1, self.last_updated_field: 1, "_id": 0} dates = { d[self.key]: self._lu_func[0](d.get(self.last_updated_field, datetime.max)) for d in self.query(properties=props) } # Get the last_updated for the store we're comparing with props = {target.key: 1, target.last_updated_field: 1, "_id": 0} target_dates = { d[target.key]: target._lu_func[0](d.get(target.last_updated_field, datetime.min)) for d in target.query(criteria=criteria, properties=props) } new_keys = set(target_dates.keys()) - set(dates.keys()) updated_keys = {key for key, date in dates.items() if target_dates.get(key, datetime.min) > date} return list(new_keys | updated_keys) criteria = {self.last_updated_field: {"$gt": self._lu_func[1](self.last_updated)}} return target.distinct(field=self.key, criteria=criteria) @deprecated(message="Please use Store.newer_in") def lu_filter(self, targets): """Creates a MongoDB filter for new documents. By "new", we mean documents in this Store that were last updated later than any document in targets. Args: targets (list): A list of Stores """ if isinstance(targets, Store): targets = [targets] lu_list = [t.last_updated for t in targets] return {self.last_updated_field: {"$gt": self._lu_func[1](max(lu_list))}} @deprecated(message="Use Store.newer_in") def updated_keys(self, target, criteria=None): """ Returns keys for docs that are newer in the target store in comparison with this store when comparing the last updated field (last_updated_field). 
Args: target (Store): store to look for updated documents criteria (dict): mongo query to limit scope Returns: list of keys that have been updated in target store """ self.ensure_index(self.key) self.ensure_index(self.last_updated_field) return self.newer_in(target, criteria=criteria) def __ne__(self, other): return not self == other def __getstate__(self): return self.as_dict() def __setstate__(self, d): d = {k: v for k, v in d.items() if not k.startswith("@")} d = MontyDecoder().process_decoded(d) self.__init__(**d) def __enter__(self): self.connect() return self def __exit__(self, exception_type, exception_value, traceback): self.close() class StoreError(Exception): """General Store-related error.""" def __init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) maggma-0.70.0/src/maggma/core/validator.py000066400000000000000000000017761470132070100203440ustar00rootroot00000000000000""" Validator class for document-level validation on Stores. Attach an instance of a Validator subclass to a Store .schema variable to enable validation on that Store. """ from abc import ABCMeta, abstractmethod from monty.json import MSONable class Validator(MSONable, metaclass=ABCMeta): """ A generic class to perform document-level validation on Stores. Attach a Validator to a Store during initialization, any all documents added to the Store will call .validate_doc() before being added. """ @abstractmethod def is_valid(self, doc: dict) -> bool: """ Determines if the document is valid. Args: doc: document to check """ @abstractmethod def validation_errors(self, doc: dict) -> list[str]: """ If document is not valid, provides a list of strings to display for why validation has failed. Returns empty list if the document is valid Args: doc: document to check """ maggma-0.70.0/src/maggma/py.typed000066400000000000000000000000001470132070100165270ustar00rootroot00000000000000maggma-0.70.0/src/maggma/stores/000077500000000000000000000000001470132070100163615ustar00rootroot00000000000000maggma-0.70.0/src/maggma/stores/__init__.py000066400000000000000000000015221470132070100204720ustar00rootroot00000000000000"""Root store module with easy imports for implemented Stores.""" from maggma.core import Store from maggma.stores.advanced_stores import AliasingStore, MongograntStore, SandboxStore, VaultStore from maggma.stores.aws import S3Store from maggma.stores.azure import AzureBlobStore from maggma.stores.compound_stores import ConcatStore, JointStore from maggma.stores.file_store import FileStore from maggma.stores.gridfs import GridFSStore from maggma.stores.mongolike import JSONStore, MemoryStore, MongoStore, MongoURIStore, MontyStore __all__ = [ "Store", "AliasingStore", "MongograntStore", "SandboxStore", "VaultStore", "S3Store", "AzureBlobStore", "ConcatStore", "JointStore", "GridFSStore", "FileStore", "JSONStore", "MemoryStore", "MongoStore", "MongoURIStore", "MontyStore", ] maggma-0.70.0/src/maggma/stores/advanced_stores.py000066400000000000000000000437031470132070100221060ustar00rootroot00000000000000""" Advanced Stores for behavior outside normal access patterns. 
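
For example (an illustrative sketch), an AliasingStore can expose a wrapped
store under different field names, so code written against one schema can read
a collection that uses another:

    from maggma.stores import AliasingStore, MemoryStore

    internal = MemoryStore(key="task_id")
    # the external key "material_id" maps to the internal field "task_id"
    store = AliasingStore(internal, {"material_id": "task_id"})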
""" import json import os from collections.abc import Iterator from typing import Optional, Union from monty.dev import deprecated, requires from maggma.core import Sort, Store, StoreError from maggma.stores.mongolike import MongoStore from maggma.utils import lazy_substitute, substitute try: import hvac except ImportError: hvac = None try: from mongogrant import Client from mongogrant.client import check from mongogrant.config import Config except ImportError: Client = None @deprecated(MongoStore) class MongograntStore(MongoStore): """Initialize a Store with a mongogrant "``:``/``." spec. Some class methods of MongoStore, e.g. from_db_file and from_collection, are not supported. mongogrant documentation: https://github.com/materialsproject/mongogrant """ @requires( Client is not None, "mongogrant is required to use MongoGrantStore. Please run `pip install maggma[mongogrant]", ) def __init__( self, mongogrant_spec: str, collection_name: str, mgclient_config_path: Optional[str] = None, **kwargs, ): """ Args: mongogrant_spec: of the form ``:``/``, where role is one of {"read", "readWrite"} or aliases {"ro", "rw"}; host is a db host (w/ optional port) or alias; and db is a db on that host, or alias. See mongogrant documentation. collection_name: name of mongo collection mgclient_config_path: Path to mongogrant client config file, or None if default path (`mongogrant.client.path`). """ self.mongogrant_spec = mongogrant_spec self.collection_name = collection_name self.mgclient_config_path = mgclient_config_path self._coll = None if self.mgclient_config_path: config = Config(check=check, path=self.mgclient_config_path) client = Client(config) else: client = Client() if {"username", "password", "database", "host"} & set(kwargs): raise StoreError( "MongograntStore does not accept " "username, password, database, or host " "arguments. Use `mongogrant_spec`." ) self.kwargs = kwargs _auth_info = client.get_db_auth_from_spec(self.mongogrant_spec) super().__init__( host=_auth_info["host"], database=_auth_info["authSource"], username=_auth_info["username"], password=_auth_info["password"], collection_name=self.collection_name, **kwargs, ) @property def name(self): return f"mgrant://{self.mongogrant_spec}/{self.collection_name}" def __hash__(self): return hash((self.mongogrant_spec, self.collection_name, self.last_updated_field)) @classmethod def from_db_file(cls, file): """ Raises ValueError since MongograntStores can't be initialized from a file. """ raise ValueError("MongograntStore doesn't implement from_db_file") @classmethod def from_collection(cls, collection): """ Raises ValueError since MongograntStores can't be initialized from a PyMongo collection. """ raise ValueError("MongograntStore doesn't implement from_collection") def __eq__(self, other: object) -> bool: """ Check equality for MongograntStore. Args: other: other MongograntStore to compare with """ if not isinstance(other, MongograntStore): return False fields = [ "mongogrant_spec", "collection_name", "mgclient_config_path", "last_updated_field", ] return all(getattr(self, f) == getattr(other, f) for f in fields) class VaultStore(MongoStore): """ Extends MongoStore to read credentials out of Vault server and uses these values to initialize MongoStore instance. """ @requires(hvac is not None, "hvac is required to use VaultStore") def __init__(self, collection_name: str, vault_secret_path: str): """ Args: collection_name: name of mongo collection vault_secret_path: path on vault server with mongo creds object. 
Important: Environment variables that must be set prior to invocation VAULT_ADDR - URL of vault server (eg. https://matgen8.lbl.gov:8200) VAULT_TOKEN or GITHUB_TOKEN - token used to authenticate to vault """ self.collection_name = collection_name self.vault_secret_path = vault_secret_path # TODO: Switch this over to Pydantic ConfigSettings vault_addr = os.getenv("VAULT_ADDR") if not vault_addr: raise RuntimeError("VAULT_ADDR not set") client = hvac.Client(vault_addr) # If we have a vault token use this token = os.getenv("VAULT_TOKEN") # Look for a github token instead if not token: github_token = os.getenv("GITHUB_TOKEN") if github_token: client.auth_github(github_token) else: raise RuntimeError("VAULT_TOKEN or GITHUB_TOKEN not set") else: client.token = token if not client.is_authenticated(): raise RuntimeError("Bad token") # Read the vault secret json_db_creds = client.read(vault_secret_path) db_creds = json.loads(json_db_creds["data"]["value"]) database = db_creds.get("db") host = db_creds.get("host", "localhost") port = db_creds.get("port", 27017) username = db_creds.get("username", "") password = db_creds.get("password", "") super().__init__(database, collection_name, host, port, username, password) def __eq__(self, other: object) -> bool: """ Check equality for VaultStore. Args: other: other VaultStore to compare with """ if not isinstance(other, VaultStore): return False fields = ["vault_secret_path", "collection_name", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) class AliasingStore(Store): """ Special Store that aliases for the primary accessors. """ def __init__(self, store: Store, aliases: dict, **kwargs): """ Args: store: the store to wrap around aliases: dict of aliases of the form external key: internal key. """ self.store = store # Given an external key tells what the internal key is self.aliases = aliases # Given the internal key tells us what the external key is self.reverse_aliases = {v: k for k, v in aliases.items()} self.kwargs = kwargs kwargs.update( { "last_updated_field": store.last_updated_field, "last_updated_type": store.last_updated_type, } ) super().__init__(**kwargs) @property def name(self) -> str: """ Return a string representing this data source. """ return self.store.name def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ criteria = criteria if criteria else {} lazy_substitute(criteria, self.reverse_aliases) return self.store.count(criteria) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
skip: number documents to skip limit: limit on total number of documents returned """ criteria = criteria if criteria else {} if properties is not None: if isinstance(properties, list): properties = {p: 1 for p in properties} substitute(properties, self.reverse_aliases) lazy_substitute(criteria, self.reverse_aliases) for d in self.store.query(properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip): substitute(d, self.aliases) yield d def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ criteria = criteria if criteria else {} lazy_substitute(criteria, self.reverse_aliases) # substitute forward return self.store.distinct(self.aliases[field], criteria=criteria) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ # Convert to a list keys = keys if isinstance(keys, list) else [keys] # Make the aliasing transformations on keys keys = [self.aliases.get(k, k) for k in keys] # Update criteria and properties based on aliases criteria = criteria if criteria else {} if properties is not None: if isinstance(properties, list): properties = {p: 1 for p in properties} substitute(properties, self.reverse_aliases) lazy_substitute(criteria, self.reverse_aliases) return self.store.groupby(keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit) def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ key = key if key else self.key for d in docs: substitute(d, self.reverse_aliases) if key in self.aliases: key = self.aliases[key] self.store.update(docs, key=key) def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ # Update criteria and properties based on aliases lazy_substitute(criteria, self.reverse_aliases) self.store.remove_docs(criteria) def ensure_index(self, key, unique=False, **kwargs): if key in self.aliases: key = self.aliases return self.store.ensure_index(key, unique, **kwargs) def close(self): self.store.close() @property def _collection(self): return self.store._collection def connect(self, force_reset=False): self.store.connect(force_reset=force_reset) def __eq__(self, other: object) -> bool: """ Check equality for AliasingStore. 
Args: other: other AliasingStore to compare with """ if not isinstance(other, AliasingStore): return False fields = ["store", "aliases", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) class SandboxStore(Store): """ Provides a sandboxed view to another store. """ def __init__(self, store: Store, sandbox: str, exclusive: bool = False): """ Args: store: store to wrap sandboxing around sandbox: the corresponding sandbox exclusive: whether to be exclusively in this sandbox or include global items. """ self.store = store self.sandbox = sandbox self.exclusive = exclusive super().__init__( key=self.store.key, last_updated_field=self.store.last_updated_field, last_updated_type=self.store.last_updated_type, validator=self.store.validator, ) @property def name(self) -> str: """ Returns: a string representing this data source. """ return f"Sandbox[{self.store.name}][{self.sandbox}]" @property def sbx_criteria(self) -> dict: """ Returns: the sandbox criteria dict used to filter the source store. """ if self.exclusive: return {"sbxn": self.sandbox} return {"$or": [{"sbxn": {"$in": [self.sandbox]}}, {"sbxn": {"$exists": False}}]} def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria return self.store.count(criteria=criteria) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned """ criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria return self.store.query(properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria return self.store.groupby(keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit) def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store. 
Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ for d in docs: if "sbxn" in d: d["sbxn"] = list(set(d["sbxn"] + [self.sandbox])) else: d["sbxn"] = [self.sandbox] self.store.update(docs, key=key) def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ # Update criteria and properties based on aliases criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria self.store.remove_docs(criteria) def ensure_index(self, key, unique=False, **kwargs): return self.store.ensure_index(key, unique, **kwargs) def close(self): self.store.close() @property def _collection(self): return self.store._collection def connect(self, force_reset=False): self.store.connect(force_reset=force_reset) def __eq__(self, other: object) -> bool: """ Check equality for SandboxStore. Args: other: other SandboxStore to compare with """ if not isinstance(other, SandboxStore): return False fields = ["store", "sandbox", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) maggma-0.70.0/src/maggma/stores/aws.py000066400000000000000000000543771470132070100175450ustar00rootroot00000000000000"""Stores for connecting to AWS data.""" import threading import warnings import zlib from collections.abc import Iterator from concurrent.futures import wait from concurrent.futures.thread import ThreadPoolExecutor from hashlib import sha1 from io import BytesIO from json import dumps from typing import Any, Callable, Optional, Union import msgpack # type: ignore from monty.msgpack import default as monty_default from maggma.core import Sort, Store from maggma.stores.ssh_tunnel import SSHTunnel from maggma.utils import grouper, to_isoformat_ceil_ms try: import boto3 import botocore from boto3.session import Session from botocore.exceptions import ClientError except (ImportError, ModuleNotFoundError): boto3 = None # type: ignore class S3Store(Store): """ GridFS like storage using Amazon S3 and a regular store for indexing. Assumes Amazon AWS key and secret key are set in environment or default config file. """ def __init__( self, index: Store, bucket: str, s3_profile: Optional[Union[str, dict]] = None, compress: bool = False, endpoint_url: Optional[str] = None, sub_dir: Optional[str] = None, s3_workers: int = 1, s3_resource_kwargs: Optional[dict] = None, ssh_tunnel: Optional[SSHTunnel] = None, key: str = "fs_id", store_hash: bool = True, unpack_data: bool = True, searchable_fields: Optional[list[str]] = None, index_store_kwargs: Optional[dict] = None, **kwargs, ): """ Initializes an S3 Store. Args: index: a store to use to index the S3 bucket. bucket: name of the bucket. s3_profile: name of AWS profile containing the credentials. Alternatively you can pass in a dictionary with the full credentials: aws_access_key_id (string) -- AWS access key ID aws_secret_access_key (string) -- AWS secret access key aws_session_token (string) -- AWS temporary session token region_name (string) -- Default region when creating new connections compress: compress files inserted into the store. endpoint_url: this allows the interface with minio service; ignored if `ssh_tunnel` is provided, in which case it is inferred. sub_dir: subdirectory of the S3 bucket to store the data. s3_workers: number of concurrent S3 puts to run. 
s3_resource_kwargs: additional kwargs to pass to the boto3 session resource. ssh_tunnel: optional SSH tunnel to use for the S3 connection. key: main key to index on. store_hash: store the SHA1 hash right before insertion to the database. unpack_data: whether to decompress and unpack byte data when querying from the bucket. searchable_fields: fields to keep in the index store. index_store_kwargs: kwargs to pass to the index store. Allows the user to use kwargs here to update the index store. """ if boto3 is None: raise RuntimeError("boto3 and botocore are required for S3Store") self.index_store_kwargs = index_store_kwargs or {} if index_store_kwargs: d_ = index.as_dict() d_.update(index_store_kwargs) self.index = index.__class__.from_dict(d_) else: self.index = index self.bucket = bucket self.s3_profile = s3_profile self.compress = compress self.endpoint_url = endpoint_url self.sub_dir = sub_dir.strip("/") + "/" if sub_dir else "" self.s3: Any = None self.s3_bucket: Any = None self.s3_workers = s3_workers self.s3_resource_kwargs = s3_resource_kwargs if s3_resource_kwargs is not None else {} self.ssh_tunnel = ssh_tunnel self.unpack_data = unpack_data self.searchable_fields = searchable_fields if searchable_fields is not None else [] self.store_hash = store_hash # Force the key to be the same as the index assert isinstance(index.key, str), "Since we are using the key as a file name in S3, they key must be a string" if key != index.key: warnings.warn( f'The desired S3Store key "{key}" does not match the index key "{index.key},"' "the index key will be used", UserWarning, ) kwargs["key"] = str(index.key) self._thread_local = threading.local() super().__init__(**kwargs) @property def name(self) -> str: """String representing this data source.""" return f"s3://{self.bucket}" def connect(self, force_reset: bool = False): # lgtm[py/conflicting-attributes] """Connect to the source data. Args: force_reset: whether to force a reset of the connection """ if self.s3 is None or force_reset: self.s3, self.s3_bucket = self._get_resource_and_bucket() self.index.connect(force_reset=force_reset) def close(self): """Closes any connections.""" self.index.close() self.s3.meta.client.close() self.s3 = None self.s3_bucket = None if self.ssh_tunnel is not None: self.ssh_tunnel.stop() @property def _collection(self): """ A handle to the pymongo collection object. Important: Not guaranteed to exist in the future. """ # For now returns the index collection since that is what we would "search" on return self.index._collection def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in. """ return self.index.count(criteria) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in. properties: properties to return in grouped documents. sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip. limit: limit on total number of documents returned. 
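
        Example (illustrative sketch; assumes a connected store keyed on
        ``fs_id`` and a user-supplied ``process`` function):

            for doc in store.query(criteria={"fs_id": {"$in": ["abc-123"]}}):
                # documents are unpacked from the bucket unless unpack_data=False
                process(doc)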
""" prop_keys = set() if isinstance(properties, dict): prop_keys = set(properties.keys()) elif isinstance(properties, list): prop_keys = set(properties) for doc in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip): if properties is not None and prop_keys.issubset(set(doc.keys())): yield {p: doc[p] for p in properties if p in doc} else: try: # TODO: This is ugly and unsafe, do some real checking before pulling data data = self.s3_bucket.Object(self._get_full_key_path(doc[self.key])).get()["Body"].read() except botocore.exceptions.ClientError as e: # If a client error is thrown, then check that it was a NoSuchKey or NoSuchBucket error. # If it was a NoSuchKey error, then the object does not exist. error_code = e.response["Error"]["Code"] if error_code in ["NoSuchKey", "NoSuchBucket"]: error_message = e.response["Error"]["Message"] self.logger.error( f"S3 returned '{error_message}' while querying '{self.bucket}' for '{doc[self.key]}'" ) continue else: raise e if self.unpack_data: data = self._read_data(data=data, compress_header=doc.get("compression", "")) if self.last_updated_field in doc: data[self.last_updated_field] = doc[self.last_updated_field] yield data def _read_data(self, data: bytes, compress_header: str) -> dict: """Reads the data and transforms it into a dictionary. Allows for subclasses to apply custom schemes for transforming the data retrieved from S3. Args: data (bytes): The raw byte representation of the data. compress_header (str): String representing the type of compression used on the data. Returns: Dict: Dictionary representation of the data. """ return self._unpack(data=data, compressed=compress_header == "zlib") @staticmethod def _unpack(data: bytes, compressed: bool): if compressed: data = zlib.decompress(data) # requires msgpack-python to be installed to fix string encoding problem # https://github.com/msgpack/msgpack/issues/121 # During recursion # msgpack.unpackb goes as deep as possible during reconstruction # MontyDecoder().process_decode only goes until it finds a from_dict # as such, we cannot just use msgpack.unpackb(data, object_hook=monty_object_hook, raw=False) # Should just return the unpacked object then let the user run process_decoded return msgpack.unpackb(data, raw=False) def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for. criteria: PyMongo filter for documents to search in. """ # Index is a store so it should have its own distinct function return self.index.distinct(field, criteria=criteria) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents. criteria: PyMongo filter for documents to search in. properties: properties to return in grouped documents. sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip. limit: limit on total number of documents returned. 
Returns: generator returning tuples of (dict, list of docs) """ return self.index.groupby( keys=keys, criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, ) def ensure_index(self, key: str, unique: bool = False) -> bool: """ Tries to create an index and return true if it succeeded. Args: key: single key to index. unique: whether this index contains only unique keys. Returns: bool indicating if the index exists/was created. """ return self.index.ensure_index(key, unique=unique) def update( self, docs: Union[list[dict], dict], key: Union[list, str, None] = None, additional_metadata: Union[str, list[str], None] = None, ): """ Update documents into the Store. Args: docs: the document or list of documents to update. key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used. additional_metadata: field(s) to include in the S3 store's metadata. """ if not isinstance(docs, list): docs = [docs] if isinstance(key, str): key = [key] elif not key: key = [self.key] if additional_metadata is None: additional_metadata = [] elif isinstance(additional_metadata, str): additional_metadata = [additional_metadata] else: additional_metadata = list(additional_metadata) self._write_to_s3_and_index(docs, key + additional_metadata + self.searchable_fields) def _write_to_s3_and_index(self, docs: list[dict], search_keys: list[str]): """Implements updating of the provided documents in S3 and the index. Allows for subclasses to apply custom approaches to parellizing the writing. Args: docs (List[Dict]): The documents to update search_keys (List[str]): The keys of the information to be updated in the index """ with ThreadPoolExecutor(max_workers=self.s3_workers) as pool: fs = { pool.submit( self.write_doc_to_s3, doc=itr_doc, search_keys=search_keys, ) for itr_doc in docs } fs, _ = wait(fs) search_docs = [sdoc.result() for sdoc in fs] # Use store's update to remove key clashes self.index.update(search_docs, key=self.key) def _get_session(self): if self.ssh_tunnel is not None: self.ssh_tunnel.start() if not hasattr(self._thread_local, "s3_bucket"): if isinstance(self.s3_profile, dict): return Session(**self.s3_profile) return Session(profile_name=self.s3_profile) return None def _get_endpoint_url(self): if self.ssh_tunnel is None: return self.endpoint_url host, port = self.ssh_tunnel.local_address return f"http://{host}:{port}" def _get_bucket(self): """If on the main thread return the bucket created above, else create a new bucket on each thread. """ if threading.current_thread().name == "MainThread": return self.s3_bucket if not hasattr(self._thread_local, "s3_bucket"): _, bucket = self._get_resource_and_bucket() self._thread_local.s3_bucket = bucket return self._thread_local.s3_bucket def _get_resource_and_bucket(self): """Helper function to create the resource and bucket objects.""" session = self._get_session() endpoint_url = self._get_endpoint_url() resource = session.resource("s3", endpoint_url=endpoint_url, **self.s3_resource_kwargs) try: resource.meta.client.head_bucket(Bucket=self.bucket) except ClientError: raise RuntimeError("Bucket not present on AWS") bucket = resource.Bucket(self.bucket) return resource, bucket def _get_full_key_path(self, id: str) -> str: """Produces the full key path for S3 items. Args: id (str): The value of the key identifier. 
Returns: str: The full key path """ return self.sub_dir + str(id) def _get_compression_function(self) -> Callable: """Returns the function to use for compressing data.""" return zlib.compress def _get_decompression_function(self) -> Callable: """Returns the function to use for decompressing data.""" return zlib.decompress def write_doc_to_s3(self, doc: dict, search_keys: list[str]) -> dict: """ Write the data to s3 and return the metadata to be inserted into the index db. Args: doc: the document. search_keys: list of keys to pull from the docs and be inserted into the index db. Returns: Dict: The metadata to be inserted into the index db """ s3_bucket = self._get_bucket() search_doc = {k: doc[k] for k in search_keys} search_doc[self.key] = doc[self.key] # Ensure key is in metadata if self.sub_dir != "": search_doc["sub_dir"] = self.sub_dir # Remove MongoDB _id from search if "_id" in search_doc: del search_doc["_id"] # to make hashing more meaningful, make sure last updated field is removed lu_info = doc.pop(self.last_updated_field, None) data = msgpack.packb(doc, default=monty_default) if self.compress: # Compress with zlib if chosen search_doc["compression"] = "zlib" data = self._get_compression_function()(data) # keep a record of original keys, in case these are important for the individual researcher # it is not expected that this information will be used except in disaster recovery s3_to_mongo_keys = {k: self._sanitize_key(k) for k in search_doc} s3_to_mongo_keys["s3-to-mongo-keys"] = "s3-to-mongo-keys" # inception # encode dictionary since values have to be strings search_doc["s3-to-mongo-keys"] = dumps(s3_to_mongo_keys) s3_bucket.upload_fileobj( Fileobj=BytesIO(data), Key=self._get_full_key_path(str(doc[self.key])), ExtraArgs={"Metadata": {s3_to_mongo_keys[k]: str(v) for k, v in search_doc.items()}}, ) if lu_info is not None: search_doc[self.last_updated_field] = lu_info if self.store_hash: hasher = sha1() hasher.update(data) obj_hash = hasher.hexdigest() search_doc["obj_hash"] = obj_hash return search_doc @staticmethod def _sanitize_key(key): """Sanitize keys to store in S3/MinIO metadata.""" # Any underscores are encoded as double dashes in metadata, since keys with # underscores may be result in the corresponding HTTP header being stripped # by certain server configurations (e.g. default nginx), leading to: # `botocore.exceptions.ClientError: An error occurred (AccessDenied) when # calling the PutObject operation: There were headers present in the request # which were not signed` # Metadata stored in the MongoDB index (self.index) is stored unchanged. # Additionally, MinIO requires lowercase keys return str(key).replace("_", "-").lower() def remove_docs(self, criteria: dict, remove_s3_object: bool = False): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match. remove_s3_object: whether to remove the actual S3 object or not. 
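
        By default only the index documents are removed; pass
        ``remove_s3_object=True`` to also delete the corresponding objects from
        the bucket. Example (illustrative key value):

            store.remove_docs({"fs_id": "abc-123"}, remove_s3_object=True)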
""" if not remove_s3_object: self.index.remove_docs(criteria=criteria) else: to_remove = self.index.distinct(self.key, criteria=criteria) self.index.remove_docs(criteria=criteria) # Can remove up to 1000 items at a time via boto to_remove_chunks = list(grouper(to_remove, n=1000)) for chunk_to_remove in to_remove_chunks: objlist = [{"Key": self._get_full_key_path(obj)} for obj in chunk_to_remove] self.s3_bucket.delete_objects(Delete={"Objects": objlist}) @property def last_updated(self): return self.index.last_updated def newer_in(self, target: Store, criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]: """ Returns the keys of documents that are newer in the target Store than this Store. Args: target: target Store. criteria: PyMongo filter for documents to search in. exhaustive: triggers an item-by-item check vs. checking the last_updated of the target Store and using that to filter out new items in. """ if hasattr(target, "index"): return self.index.newer_in(target=target.index, criteria=criteria, exhaustive=exhaustive) return self.index.newer_in(target=target, criteria=criteria, exhaustive=exhaustive) def __hash__(self): return hash((self.index.__hash__, self.bucket)) def rebuild_index_from_s3_data(self, **kwargs): """ Rebuilds the index Store from the data in S3. Relies on the index document being stores as the metadata for the file. This can help recover lost databases. """ bucket = self.s3_bucket objects = bucket.objects.filter(Prefix=self.sub_dir) for obj in objects: key_ = self._get_full_key_path(obj.key) data = self.s3_bucket.Object(key_).get()["Body"].read() if self.compress: data = self._get_decompression_function()(data) unpacked_data = msgpack.unpackb(data, raw=False) self.update(unpacked_data, **kwargs) def rebuild_metadata_from_index(self, index_query: Optional[dict] = None): """ Read data from the index store and populate the metadata of the S3 bucket. Force all the keys to be lower case to be Minio compatible. Args: index_query: query on the index store. """ qq = {} if index_query is None else index_query for index_doc in self.index.query(qq): key_ = self._get_full_key_path(index_doc[self.key]) s3_object = self.s3_bucket.Object(key_) new_meta = {self._sanitize_key(k): v for k, v in s3_object.metadata.items()} for k, v in index_doc.items(): new_meta[str(k).lower()] = v new_meta.pop("_id") if self.last_updated_field in new_meta: new_meta[self.last_updated_field] = str(to_isoformat_ceil_ms(new_meta[self.last_updated_field])) # s3_object.metadata.update(new_meta) s3_object.copy_from( CopySource={"Bucket": self.s3_bucket.name, "Key": key_}, Metadata=new_meta, MetadataDirective="REPLACE", ) def __eq__(self, other: object) -> bool: """ Check equality for S3Store. other: other S3Store to compare with. """ if not isinstance(other, S3Store): return False fields = ["index", "bucket", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) maggma-0.70.0/src/maggma/stores/azure.py000066400000000000000000000516751470132070100200770ustar00rootroot00000000000000""" Advanced Stores for connecting to Microsoft Azure data. 
""" import os import threading import warnings import zlib from collections.abc import Iterator from concurrent.futures import wait from concurrent.futures.thread import ThreadPoolExecutor from hashlib import sha1 from json import dumps from typing import Optional, Union import msgpack # type: ignore from monty.msgpack import default as monty_default from maggma.core import Sort, Store from maggma.utils import grouper, to_isoformat_ceil_ms try: import azure import azure.storage.blob as azure_blob from azure.core.exceptions import ResourceExistsError from azure.identity import DefaultAzureCredential from azure.storage.blob import BlobServiceClient, ContainerClient except (ImportError, ModuleNotFoundError): azure_blob = None # type: ignore ContainerClient = None AZURE_KEY_SANITIZE = {"-": "_", ".": "_"} class AzureBlobStore(Store): """ GridFS like storage using Azure Blob and a regular store for indexing. Requires azure-storage-blob and azure-identity modules to be installed. """ def __init__( self, index: Store, container_name: str, azure_client_info: Optional[Union[str, dict]] = None, compress: bool = False, sub_dir: Optional[str] = None, workers: int = 1, azure_resource_kwargs: Optional[dict] = None, key: str = "fs_id", store_hash: bool = True, unpack_data: bool = True, searchable_fields: Optional[list[str]] = None, key_sanitize_dict: Optional[dict] = None, create_container: bool = False, **kwargs, ): """ Initializes an AzureBlob Store. Args: index: a store to use to index the Azure blob container_name: name of the container azure_client_info: connection_url of the BlobServiceClient if a string. Assumes that the access is passwordless in that case. Otherwise, if a dictionary, options to instantiate the BlobServiceClient. Currently supported keywords: - connection_string: a connection string for the Azure blob compress: compress files inserted into the store sub_dir: (optional) subdirectory of the container to store the data. When defined, a final "/" will be added if not already present. workers: number of concurrent Azure puts to run store_hash: store the sha1 hash right before insertion to the database. unpack_data: whether to decompress and unpack byte data when querying from the container. searchable_fields: fields to keep in the index store key_sanitize_dict: a dictionary that allows to customize the sanitization of the keys in metadata, since they should adhere to the naming rules for C# identifiers. If None the AZURE_KEY_SANITIZE default will be used to handle the most common cases. create_container: if True the Store creates the container, in case it does not exist. kwargs: keywords for the base Store. 
""" if azure_blob is None: raise RuntimeError("azure-storage-blob and azure-identity are required for AzureBlobStore") self.index = index self.container_name = container_name self.azure_client_info = azure_client_info self.compress = compress self.sub_dir = sub_dir.rstrip("/") + "/" if sub_dir else "" self.service: Optional[BlobServiceClient] = None self.container: Optional[ContainerClient] = None self.workers = workers self.azure_resource_kwargs = azure_resource_kwargs if azure_resource_kwargs is not None else {} self.unpack_data = unpack_data self.searchable_fields = searchable_fields if searchable_fields is not None else [] self.store_hash = store_hash if key_sanitize_dict is None: key_sanitize_dict = AZURE_KEY_SANITIZE self.key_sanitize_dict = key_sanitize_dict self.create_container = create_container # Force the key to be the same as the index assert isinstance( index.key, str ), "Since we are using the key as a file name in Azure Blob, the key must be a string" if key != index.key: warnings.warn( f'The desired AzureBlobStore key "{key}" does not match the index key "{index.key},"' "the index key will be used", UserWarning, ) kwargs["key"] = str(index.key) self._thread_local = threading.local() super().__init__(**kwargs) @property def name(self) -> str: """ Returns: a string representing this data source. """ return f"container://{self.container_name}" def connect(self, *args, **kwargs): # lgtm[py/conflicting-attributes] """ Connect to the source data. """ service_client = self._get_service_client() if not self.service: self.service = service_client container = service_client.get_container_client(self.container_name) if not container.exists(): if self.create_container: # catch the exception to avoid errors if already created try: container.create_container() except ResourceExistsError: pass else: raise RuntimeError(f"Container not present on Azure: {self.container_name}") self.container = container self.index.connect(*args, **kwargs) def close(self): """ Closes any connections. """ self.index.close() self.service = None self.container = None @property def _collection(self): """ Returns: a handle to the pymongo collection object. Important: Not guaranteed to exist in the future """ # For now returns the index collection since that is what we would "search" on return self.index._collection def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ return self.index.count(criteria) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
skip: number documents to skip limit: limit on total number of documents returned """ if self.container is None or self.service is None: raise RuntimeError("The store has not been connected") prop_keys = set() if isinstance(properties, dict): prop_keys = set(properties.keys()) elif isinstance(properties, list): prop_keys = set(properties) for doc in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip): if properties is not None and prop_keys.issubset(set(doc.keys())): yield {p: doc[p] for p in properties if p in doc} else: try: data = self.container.download_blob(self.sub_dir + str(doc[self.key])).readall() except azure.core.exceptions.ResourceNotFoundError: self.logger.error(f"Could not find Blob object {doc[self.key]}") if self.unpack_data: data = self._unpack(data=data, compressed=doc.get("compression", "") == "zlib") if self.last_updated_field in doc: data[self.last_updated_field] = doc[self.last_updated_field] # type: ignore yield data # type: ignore @staticmethod def _unpack(data: bytes, compressed: bool): if compressed: data = zlib.decompress(data) # requires msgpack-python to be installed to fix string encoding problem # https://github.com/msgpack/msgpack/issues/121 # During recursion # msgpack.unpackb goes as deep as possible during reconstruction # MontyDecoder().process_decode only goes until it finds a from_dict # as such, we cannot just use msgpack.unpackb(data, object_hook=monty_object_hook, raw=False) # Should just return the unpacked object then let the user run process_decoded return msgpack.unpackb(data, raw=False) def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ # Index is a store so it should have its own distinct function return self.index.distinct(field, criteria=criteria) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ return self.index.groupby( keys=keys, criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, ) def ensure_index(self, key: str, unique: bool = False) -> bool: """ Tries to create an index and return true if it succeeded. Args: key: single key to index unique: Whether or not this index contains only unique keys Returns: bool indicating if the index exists/was created """ return self.index.ensure_index(key, unique=unique) def update( self, docs: Union[list[dict], dict], key: Union[list, str, None] = None, additional_metadata: Union[str, list[str], None] = None, ): """ Update documents into the Store. 
Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used additional_metadata: field(s) to include in the blob store's metadata """ if self.container is None or self.service is None: raise RuntimeError("The store has not been connected") if not isinstance(docs, list): docs = [docs] if isinstance(key, str): key = [key] elif not key: key = [self.key] if additional_metadata is None: additional_metadata = [] elif isinstance(additional_metadata, str): additional_metadata = [additional_metadata] else: additional_metadata = list(additional_metadata) with ThreadPoolExecutor(max_workers=self.workers) as pool: fs = { pool.submit( self.write_doc_to_blob, doc=itr_doc, search_keys=key + additional_metadata + self.searchable_fields, ) for itr_doc in docs } fs, _ = wait(fs) search_docs = [sdoc.result() for sdoc in fs] # Use store's update to remove key clashes self.index.update(search_docs, key=self.key) def _get_service_client(self): if not hasattr(self._thread_local, "container"): if isinstance(self.azure_client_info, str): # assume it is the account_url and that the connection is passwordless default_credential = DefaultAzureCredential() return BlobServiceClient(self.azure_client_info, credential=default_credential) if isinstance(self.azure_client_info, dict): connection_string = self.azure_client_info.get("connection_string") if connection_string: return BlobServiceClient.from_connection_string(conn_str=connection_string) msg = f"Could not instantiate BlobServiceClient from azure_client_info: {self.azure_client_info}" raise RuntimeError(msg) return None def _get_container(self) -> Optional[ContainerClient]: """ If on the main thread return the container created above, else create a new container on each thread. """ if threading.current_thread().name == "MainThread": return self.container if not hasattr(self._thread_local, "container"): service_client = self._get_service_client() container = service_client.get_container_client(self.container_name) self._thread_local.container = container return self._thread_local.container def write_doc_to_blob(self, doc: dict, search_keys: list[str]): """ Write the data to an Azure blob and return the metadata to be inserted into the index db. 
Args: doc: the document search_keys: list of keys to pull from the docs and be inserted into the index db """ container = self._get_container() if container is None: raise RuntimeError("The store has not been connected") search_doc = {k: doc[k] for k in search_keys} search_doc[self.key] = doc[self.key] # Ensure key is in metadata if self.sub_dir != "": search_doc["sub_dir"] = self.sub_dir # Remove MongoDB _id from search if "_id" in search_doc: del search_doc["_id"] # to make hashing more meaningful, make sure last updated field is removed lu_info = doc.pop(self.last_updated_field, None) data = msgpack.packb(doc, default=monty_default) if self.compress: # Compress with zlib if chosen search_doc["compression"] = "zlib" data = zlib.compress(data) if self.last_updated_field in doc: # need this conversion for metadata insert search_doc[self.last_updated_field] = str(to_isoformat_ceil_ms(doc[self.last_updated_field])) # keep a record of original keys, in case these are important for the individual researcher # it is not expected that this information will be used except in disaster recovery blob_to_mongo_keys = {k: self._sanitize_key(k) for k in search_doc} blob_to_mongo_keys["blob_to_mongo_keys"] = "blob_to_mongo_keys" # inception # encode dictionary since values have to be strings search_doc["blob_to_mongo_keys"] = dumps(blob_to_mongo_keys) container.upload_blob( name=self.sub_dir + str(doc[self.key]), data=data, metadata={blob_to_mongo_keys[k]: str(v) for k, v in search_doc.items()}, overwrite=True, ) if lu_info is not None: search_doc[self.last_updated_field] = lu_info if self.store_hash: hasher = sha1() hasher.update(data) obj_hash = hasher.hexdigest() search_doc["obj_hash"] = obj_hash return search_doc def _sanitize_key(self, key): """ Sanitize keys to store metadata. The metadata keys should adhere to the naming rules for C# identifiers. """ new_key = str(key) for k, v in self.key_sanitize_dict.items(): new_key = new_key.replace(k, v) return new_key def remove_docs(self, criteria: dict, remove_blob_object: bool = False): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match remove_blob_object: whether to remove the actual blob Object or not """ if self.container is None or self.service is None: raise RuntimeError("The store has not been connected") if not remove_blob_object: self.index.remove_docs(criteria=criteria) else: to_remove = self.index.distinct(self.key, criteria=criteria) self.index.remove_docs(criteria=criteria) # Can remove up to 256 items at a time to_remove_chunks = list(grouper(to_remove, n=256)) for chunk_to_remove in to_remove_chunks: objlist = [{"name": f"{self.sub_dir}{obj}"} for obj in chunk_to_remove] self.container.delete_blobs(*objlist) @property def last_updated(self): return self.index.last_updated def newer_in(self, target: Store, criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]: """ Returns the keys of documents that are newer in the target Store than this Store. Args: target: target Store criteria: PyMongo filter for documents to search in exhaustive: triggers an item-by-item check vs. 
checking the last_updated of the target Store and using that to filter out new items in """ if hasattr(target, "index"): return self.index.newer_in(target=target.index, criteria=criteria, exhaustive=exhaustive) return self.index.newer_in(target=target, criteria=criteria, exhaustive=exhaustive) def __hash__(self): return hash((self.index.__hash__, self.container_name)) def rebuild_index_from_blob_data(self, **kwargs): """ Rebuilds the index Store from the data in Azure Relies on the index document being stores as the metadata for the file This can help recover lost databases. """ objects = self.container.list_blobs(name_starts_with=self.sub_dir) for obj in objects: # handle the case where there are subdirs in the chosen container # but are below the level of the current subdir dir_name = os.path.dirname(obj.name) if dir_name != self.sub_dir: continue data = self.container.download_blob(obj.name).readall() if self.compress: data = zlib.decompress(data) unpacked_data = msgpack.unpackb(data, raw=False) # TODO maybe it can be avoided to reupload the data, since it is paid self.update(unpacked_data, **kwargs) def rebuild_metadata_from_index(self, index_query: Optional[dict] = None): """ Read data from the index store and populate the metadata of the Azure Blob. Force all of the keys to be lower case to be Minio compatible Args: index_query: query on the index store. """ if self.container is None or self.service is None: raise RuntimeError("The store has not been connected") qq = {} if index_query is None else index_query for index_doc in self.index.query(qq): key_ = self.sub_dir + index_doc[self.key] blob = self.container.get_blob_client(key_) properties = blob.get_blob_properties() new_meta = {self._sanitize_key(k): v for k, v in properties.metadata.items()} for k, v in index_doc.items(): new_meta[str(k).lower()] = v new_meta.pop("_id") if self.last_updated_field in new_meta: new_meta[self.last_updated_field] = str(to_isoformat_ceil_ms(new_meta[self.last_updated_field])) blob.set_blob_metadata(new_meta) def __eq__(self, other: object) -> bool: """ Check equality for AzureBlobStore other: other AzureBlobStore to compare with. """ if not isinstance(other, AzureBlobStore): return False fields = ["index", "container_name", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) maggma-0.70.0/src/maggma/stores/compound_stores.py000066400000000000000000000417141470132070100221650ustar00rootroot00000000000000"""Special stores that combine underlying Stores together.""" from collections.abc import Iterator from datetime import datetime from itertools import groupby from typing import Optional, Union from pydash import set_ from pymongo import MongoClient from maggma.core import Sort, Store, StoreError from maggma.stores.mongolike import MongoStore class JointStore(Store): """ Store that implements a on-the-fly join across multiple collections all in the same MongoDB database. This is a Read-Only Store designed to combine data from multiple collections. 
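    Example:
        A minimal, illustrative sketch of joining two collections; the database,
        collection, and field names below are placeholders, not defaults of this
        Store, and a running MongoDB instance is assumed:

            store = JointStore(
                database="my_db",
                collection_names=["tasks", "analysis"],
                main="tasks",
                key="task_id",
            )
            store.connect()
            doc = store.query_one(criteria={"task_id": "some-id"})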
""" def __init__( self, database: str, collection_names: list[str], host: str = "localhost", port: int = 27017, username: str = "", password: str = "", main: Optional[str] = None, merge_at_root: bool = False, mongoclient_kwargs: Optional[dict] = None, **kwargs, ): """ Args: database: The database name collection_names: list of all collections to join host: Hostname for the database port: TCP port to connect to username: Username for the collection password: Password to connect with main: name for the main collection if not specified this defaults to the first in collection_names list. """ self.database = database self.collection_names = collection_names self.host = host self.port = port self.username = username self.password = password self._coll = None # type: Any self.main = main or collection_names[0] self.merge_at_root = merge_at_root self.mongoclient_kwargs = mongoclient_kwargs or {} self.kwargs = kwargs super().__init__(**kwargs) @property def name(self) -> str: """ Return a string representing this data source. """ compound_name = ",".join(self.collection_names) return f"Compound[{self.host}/{self.database}][{compound_name}]" def connect(self, force_reset: bool = False): """ Connects the underlying Mongo database and all collection connections. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ if not self._coll or force_reset: conn: MongoClient = ( MongoClient( host=self.host, port=self.port, username=self.username, password=self.password, **self.mongoclient_kwargs, ) if self.username != "" else MongoClient(self.host, self.port, **self.mongoclient_kwargs) ) db = conn[self.database] self._coll = db[self.main] self._has_merge_objects = self._collection.database.client.server_info()["version"] > "3.6" def close(self): """ Closes underlying database connections. """ self._collection.database.client.close() @property def _collection(self): """Property referring to the root pymongo collection.""" if self._coll is None: raise StoreError("Must connect Mongo-like store before attempting to use it") return self._coll @property def nonmain_names(self) -> list: """ all non-main collection names. """ return list(set(self.collection_names) - {self.main}) @property def last_updated(self) -> datetime: """ Special last_updated for this JointStore that checks all underlying collections. """ lus = [] for cname in self.collection_names: store = MongoStore.from_collection(self._collection.database[cname]) store.last_updated_field = self.last_updated_field lu = store.last_updated lus.append(lu) return max(lus) # TODO: implement update? def update(self, docs, update_lu=True, key=None, **kwargs): """ Update documents into the underlying collections Not Implemented for JointStore. """ raise NotImplementedError("JointStore is a read-only store") def _get_store_by_name(self, name) -> MongoStore: """ Gets an underlying collection as a mongoStore. """ if name not in self.collection_names: raise ValueError("Asking for collection not referenced in this Store") return MongoStore.from_collection(self._collection.database[name]) def ensure_index(self, key, unique=False, **kwargs): """ Can't ensure index for JointStore. """ raise NotImplementedError("No ensure_index method for JointStore") def _get_pipeline(self, criteria=None, properties=None, skip=0, limit=0): """ Gets the aggregation pipeline for query and query_one. 
Args: properties: properties to be returned criteria: criteria to filter by skip: docs to skip limit: limit results to N docs Returns: list of aggregation operators """ pipeline = [] collection_names = list(set(self.collection_names) - set(self.main)) for cname in collection_names: pipeline.append( { "$lookup": { "from": cname, "localField": self.key, "foreignField": self.key, "as": cname, } } ) if self.merge_at_root: if not self._has_merge_objects: raise Exception("MongoDB server version too low to use $mergeObjects.") pipeline.append( { "$replaceRoot": { "newRoot": { "$mergeObjects": [ {"$arrayElemAt": [f"${cname}", 0]}, "$$ROOT", ] } } } ) else: pipeline.append( { "$unwind": { "path": f"${cname}", "preserveNullAndEmptyArrays": True, } } ) # Do projection for max last_updated lu_max_fields = [f"${self.last_updated_field}"] lu_max_fields.extend([f"${cname}.{self.last_updated_field}" for cname in self.collection_names]) lu_proj = {self.last_updated_field: {"$max": lu_max_fields}} pipeline.append({"$addFields": lu_proj}) if criteria: pipeline.append({"$match": criteria}) if isinstance(properties, list): properties = {k: 1 for k in properties} if properties: pipeline.append({"$project": properties}) if skip > 0: pipeline.append({"$skip": skip}) if limit > 0: pipeline.append({"$limit": limit}) return pipeline def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ pipeline = self._get_pipeline(criteria=criteria) pipeline.append({"$count": "count"}) agg = list(self._collection.aggregate(pipeline)) return agg[0].get("count", 0) if len(agg) > 0 else 0 def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: pipeline = self._get_pipeline(criteria=criteria, properties=properties, skip=skip, limit=limit) agg = self._collection.aggregate(pipeline) yield from agg def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: pipeline = self._get_pipeline(criteria=criteria, properties=properties, skip=skip, limit=limit) if not isinstance(keys, list): keys = [keys] group_id = {} # type: Dict[str,Any] for key in keys: set_(group_id, key, f"${key}") pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}}) agg = self._collection.aggregate(pipeline) for d in agg: yield d["_id"], d["docs"] def query_one(self, criteria=None, properties=None, **kwargs): """ Get one document. Args: properties: properties to return in query criteria: filter for matching kwargs: kwargs for collection.aggregate Returns: single document """ # TODO: maybe adding explicit limit in agg pipeline is better as below? # pipeline = self._get_pipeline(properties, criteria) # pipeline.append({"$limit": 1}) query = self.query(criteria=criteria, properties=properties, **kwargs) try: return next(query) except StopIteration: return None def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ raise NotImplementedError("No remove_docs method for JointStore") def __eq__(self, other: object) -> bool: """ Check equality for JointStore Args: other: other JointStore to compare with. 
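        Example:
            An illustrative sketch (names are placeholders); two JointStores compare
            equal when their database, collection_names, host, port, main, and
            merge_at_root settings all match:

                JointStore("db", ["tasks", "analysis"]) == JointStore("db", ["tasks", "analysis"])  # True
                JointStore("db", ["tasks", "analysis"]) == JointStore("db", ["tasks"])              # False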
""" if not isinstance(other, JointStore): return False fields = [ "database", "collection_names", "host", "port", "main", "merge_at_root", ] return all(getattr(self, f) == getattr(other, f) for f in fields) class ConcatStore(Store): """Store concatting multiple stores.""" def __init__(self, stores: list[Store], **kwargs): """ Initialize a ConcatStore that concatenates multiple stores together to appear as one store. Args: stores: list of stores to concatenate together """ self.stores = stores self.kwargs = kwargs super().__init__(**kwargs) @property def name(self) -> str: """ A string representing this data source. """ compound_name = ",".join([store.name for store in self.stores]) return f"Concat[{compound_name}]" def connect(self, force_reset: bool = False): """ Connect all stores in this ConcatStore. Args: force_reset: Whether to forcibly reset the connection for all stores """ for store in self.stores: store.connect(force_reset) def close(self): """ Close all connections in this ConcatStore. """ for store in self.stores: store.close() @property def _collection(self): raise NotImplementedError("No collection property for ConcatStore") @property def last_updated(self) -> datetime: """ Finds the most recent last_updated across all the stores. This might not be the most useful way to do this for this type of Store since it could very easily over-estimate the last_updated based on what stores are used. """ lus = [] for store in self.stores: lu = store.last_updated lus.append(lu) return max(lus) def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store Not implemented in ConcatStore. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ raise NotImplementedError("No update method for ConcatStore") def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ distincts = [] for store in self.stores: distincts.extend(store.distinct(field=field, criteria=criteria)) return list(set(distincts)) def ensure_index(self, key: str, unique: bool = False) -> bool: """ Ensure an index is properly set. Returns whether all stores support this index or not. Args: key: single key to index unique: Whether or not this index contains only unique keys Returns: bool indicating if the index exists/was created on all stores """ return all(store.ensure_index(key, unique) for store in self.stores) def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ counts = [store.count(criteria) for store in self.stores] return sum(counts) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries across all Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
skip: number documents to skip limit: limit on total number of documents returned """ # TODO: skip, sort and limit are broken. implement properly for store in self.stores: yield from store.query(criteria=criteria, properties=properties) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ if isinstance(keys, str): keys = [keys] docs = [] for store in self.stores: temp_docs = list( store.groupby( keys=keys, criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, ) ) for _key, group in temp_docs: docs.extend(group) def key_set(d: dict) -> tuple: """Index function based on passed in keys.""" return tuple(d.get(k) for k in keys) sorted_docs = sorted(docs, key=key_set) for vals, group_iter in groupby(sorted_docs, key=key_set): id_dict = dict(zip(keys, vals)) yield id_dict, list(group_iter) def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ raise NotImplementedError("No remove_docs method for JointStore") def __eq__(self, other: object) -> bool: """ Check equality for ConcatStore. Args: other: other JointStore to compare with """ if not isinstance(other, ConcatStore): return False fields = ["stores"] return all(getattr(self, f) == getattr(other, f) for f in fields) maggma-0.70.0/src/maggma/stores/file_store.py000066400000000000000000000530241470132070100210720ustar00rootroot00000000000000""" Module defining a FileStore that enables accessing files in a local directory using typical maggma access patterns. """ import fnmatch import hashlib import os import re import warnings from collections.abc import Iterator from datetime import datetime, timezone from pathlib import Path from typing import Callable, Optional, Union from monty.io import zopen from pymongo import UpdateOne from maggma.core import Sort, StoreError from maggma.stores.mongolike import JSONStore, MemoryStore # These keys are automatically populated by the FileStore.read() method and # hence are not allowed to be manually overwritten PROTECTED_KEYS = { "_id", "name", "path", "last_updated", "hash", "size", "parent", "orphan", "contents", } class FileStore(MemoryStore): """ A Store for files on disk. Provides a common access method consistent with other stores. Each Item in the Store represents one file. Files can be organized into any type of directory structure. A hash of the full path to each file is used to define a file_id that uniquely identifies each item. Any metadata added to the items is written to a .json file in the root directory of the FileStore. """ def __init__( self, path: Union[str, Path], file_filters: Optional[list] = None, max_depth: Optional[int] = None, read_only: bool = True, include_orphans: bool = False, json_name: str = "FileStore.json", encoding: Optional[str] = None, **kwargs, ): """ Initializes a FileStore. 
Args: path: parent directory containing all files and subdirectories to process file_filters: List of fnmatch patterns defining the files to be tracked by the FileStore. Only files that match one of the patterns provided will be included in the Store If None (default), all files are included. Examples: ["*.txt", "test-[abcd].txt"], etc. See https://docs.python.org/3/library/fnmatch.html for full syntax max_depth: The maximum depth to look into subdirectories. 0 = no recursion, 1 = include files 1 directory below the FileStore, etc. None (default) will scan all files below the FileStore root directory, regardless of depth. read_only: If True (default), the .update() and .remove_docs() methods are disabled, preventing any changes to the files on disk. In addition, metadata cannot be written to disk. include_orphans: Whether to include orphaned metadata records in query results. Orphaned metadata records are records found in the local JSON file that can no longer be associated to a file on disk. This can happen if a file is renamed or deleted, or if the FileStore is re-initialized with a more restrictive file_filters or max_depth argument. By default (False), these records do not appear in query results. Nevertheless, the metadata records are retained in the JSON file and the FileStore to prevent accidental data loss. json_name: Name of the .json file to which metadata is saved. If read_only is False, this file will be created in the root directory of the FileStore. encoding: Character encoding of files to be tracked by the store. The default (None) follows python's default behavior, which is to determine the character encoding from the platform. This should work in the great majority of cases. However, if you encounter a UnicodeDecodeError, consider setting the encoding explicitly to 'utf8' or another encoding as appropriate. kwargs: kwargs passed to MemoryStore.__init__() """ # this conditional block is needed in order to guarantee that the 'name' # property, which is passed to `MemoryStore`, works correctly # collection names passed to MemoryStore cannot end with '.' if path == ".": path = Path.cwd() self.path = Path(path) if isinstance(path, str) else path self.json_name = json_name file_filters = file_filters if file_filters else ["*"] self.file_filters = re.compile("|".join(fnmatch.translate(p) for p in file_filters)) self.collection_name = "file_store" self.key = "file_id" self.include_orphans = include_orphans self.read_only = read_only self.max_depth = max_depth self.encoding = encoding self.metadata_store = JSONStore( paths=[str(self.path / self.json_name)], read_only=self.read_only, collection_name=self.collection_name, key=self.key, ) self.kwargs = kwargs super().__init__( collection_name=self.collection_name, key=self.key, **self.kwargs, ) @property def name(self) -> str: """ Return a string representing this data source. """ return f"file://{self.path}" def add_metadata( self, metadata: Optional[dict] = None, query: Optional[dict] = None, auto_data: Optional[Callable[[dict], dict]] = None, **kwargs, ): """ Add metadata to a record in the FileStore, either manually or by computing it automatically from another field, such as name or path (see auto_data). Args: metadata: dict of additional data to add to the records returned by query. Note that any protected keys (such as 'name', 'path', etc.) will be ignored. query: Query passed to FileStore.query() auto_data: A function that automatically computes metadata based on a field in the record itself. 
The function must take in the item as a dict and return a dict containing the desired metadata. A typical use case is to assign metadata based on the name of a file. For example, for data files named like `2022-04-01_april_fool_experiment.txt`, the auto_data function could be: def get_metadata_from_filename(d): return {"date": d["name"].split("_")[0], "test_name": d["name"].split("_")[1] } Note that in the case of conflict between manual and automatically computed metadata (for example, if metadata={"name": "another_name"} was supplied alongside the auto_data function above), the manually-supplied metadata is used. kwargs: kwargs passed to FileStore.query() """ if metadata is None: metadata = {} # sanitize the metadata filtered_metadata = self._filter_data(metadata) updated_docs = [] for doc in self.query(query, **kwargs): if auto_data: extra_data = self._filter_data(auto_data(doc)) doc.update(extra_data) doc.update(filtered_metadata) updated_docs.append(doc) self.update(updated_docs, key=self.key) def read(self) -> list[dict]: """ Iterate through all files in the Store folder and populate the Store with dictionaries containing basic information about each file. The keys of the documents added to the Store are: - name: str = File name - path: Path = Absolute path of this file - parent: str = Name of the parent directory (if any) - file_id: str = Unique identifier for this file, computed from the hash of its path relative to the base FileStore directory and the file creation time. The key of this field is 'file_id' by default but can be changed via the 'key' kwarg to `FileStore.__init__()`. - size: int = Size of this file in bytes - last_updated: datetime = Time this file was last modified - hash: str = Hash of the file contents - orphan: bool = Whether this record is an orphan """ file_list = [] # generate a list of files in subdirectories for root, _dirs, files in os.walk(self.path): # for pattern in self.file_filters: for match in filter(self.file_filters.match, files): # for match in fnmatch.filter(files, pattern): path = Path(os.path.join(root, match)) # ignore the .json file created by the Store if path.is_file() and path.name != self.json_name: # filter based on depth depth = len(path.relative_to(self.path).parts) - 1 if self.max_depth is None or depth <= self.max_depth: file_list.append(self._create_record_from_file(path)) return file_list def _create_record_from_file(self, f: Path) -> dict: """ Given the path to a file, return a Dict that constitutes a record of basic information about that file. The keys in the returned dict are: - name: str = File name - path: Path = Absolute path of this file - parent: str = Name of the parent directory (if any) - file_id: str = Unique identifier for this file, computed from the hash of its path relative to the base FileStore directory and the file creation time. The key of this field is 'file_id' by default but can be changed via the 'key' kwarg to FileStore.__init__(). 
- size: int = Size of this file in bytes - last_updated: datetime = Time this file was last modified - hash: str = Hash of the file contents - orphan: bool = Whether this record is an orphan """ # compute the file_id from the relative path relative_path = f.relative_to(self.path) digest = hashlib.md5() digest.update(str(relative_path).encode()) file_id = str(digest.hexdigest()) # hash the file contents digest2 = hashlib.md5() b = bytearray(128 * 2056) mv = memoryview(b) digest2.update(self.name.encode()) with open(f.as_posix(), "rb", buffering=0) as file: # this block copied from the file_digest method in python 3.11+ # see https://github.com/python/cpython/blob/0ba07b2108d4763273f3fb85544dde34c5acd40a/Lib/hashlib.py#L213 if hasattr(file, "getbuffer"): # io.BytesIO object, use zero-copy buffer digest2.update(file.getbuffer()) else: for n in iter(lambda: file.readinto(mv), 0): digest2.update(mv[:n]) content_hash = str(digest2.hexdigest()) stats = f.stat() return { "name": f.name, "path": f, "path_relative": relative_path, "parent": f.parent.name, "size": stats.st_size, "last_updated": datetime.fromtimestamp(stats.st_mtime, tz=timezone.utc), "orphan": False, "hash": content_hash, self.key: file_id, } def connect(self, force_reset: bool = False): """ Connect to the source data. Read all the files in the directory, create corresponding File items in the internal MemoryStore. If there is a metadata .json file in the directory, read its contents into the MemoryStore Args: force_reset: whether to reset the connection or not when the Store is already connected. """ # read all files and place them in the MemoryStore # use super.update to bypass the read_only guard statement # because we want the file data to be populated in memory super().connect(force_reset=force_reset) super().update(self.read()) # now read any metadata from the .json file try: self.metadata_store.connect(force_reset=force_reset) metadata = list(self.metadata_store.query()) except FileNotFoundError: metadata = [] warnings.warn( f""" JSON file '{self.json_name}' not found. To create this file automatically, re-initialize the FileStore with read_only=False. """ ) # merge metadata with file data and check for orphaned metadata requests = [] found_orphans = False key = self.key file_ids = self.distinct(self.key) for d in metadata: search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]} if d[key] not in file_ids: found_orphans = True d.update({"orphan": True}) del d["_id"] requests.append(UpdateOne(search_doc, {"$set": d}, upsert=True)) if found_orphans: warnings.warn( f"Orphaned metadata was found in {self.json_name}. This metadata" "will be added to the store with {'orphan': True}" ) if len(requests) > 0: self._collection.bulk_write(requests, ordered=False) def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update items in the Store. Only possible if the store is not read only. Any new fields that are added will be written to the JSON file in the root directory of the FileStore. Note that certain fields that come from file metadata on disk are protected and cannot be updated with this method. This prevents the contents of the FileStore from becoming out of sync with the files on which it is based. The protected fields are keys in the dict returned by _create_record_from_file, e.g. 'name', 'parent', 'path', 'last_updated', 'hash', 'size', 'contents', and 'orphan'. 
The 'path_relative' and key fields are retained to make each document in the JSON file identifiable by manual inspection. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ if self.read_only: raise StoreError( "This Store is read-only. To enable file I/O, re-initialize the store with read_only=False." ) super().update(docs, key) data = list(self.query()) filtered_data = [] # remove fields that are populated by .read() for d in data: filtered_d = self._filter_data(d) # don't write records that contain only file_id if len(set(filtered_d.keys()).difference({"path_relative", self.key})) != 0: filtered_data.append(filtered_d) self.metadata_store.update(filtered_data, self.key) def _filter_data(self, d): """ Remove any protected keys from a dictionary. Args: d: Dictionary whose keys are to be filtered """ return {k: v for k, v in d.items() if k not in PROTECTED_KEYS.union({self.last_updated_field})} def query( # type: ignore self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, hint: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, contents_size_limit: Optional[int] = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. hint: Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned contents_size_limit: Maximum file size in bytes for which to return contents. The FileStore will attempt to read the file and populate the 'contents' key with its content at query time, unless the file size is larger than this value. By default, reading content is disabled. Note that enabling content reading can substantially slow down the query operation, especially when there are large numbers of files. """ return_contents = False criteria = criteria if criteria else {} if criteria.get("orphan", None) is None and not self.include_orphans: criteria.update({"orphan": False}) if criteria.get("contents"): warnings.warn("'contents' is not a queryable field! 
Ignoring.") if isinstance(properties, list): properties = {p: 1 for p in properties} orig_properties = properties.copy() if properties else None if properties is None: # None means return all fields, including contents return_contents = True elif properties.get("contents"): return_contents = True # remove contents b/c it isn't stored in the MemoryStore properties.pop("contents") # add size and path to query so that file can be read properties.update({"size": 1}) properties.update({"path": 1}) for d in super().query( criteria=criteria, properties=properties, sort=sort, hint=hint, skip=skip, limit=limit, ): # add file contents to the returned documents, if appropriate if return_contents and not d.get("orphan"): if contents_size_limit is None or d["size"] <= contents_size_limit: # attempt to read the file contents and inject into the document # TODO - could add more logic for detecting different file types # and more nuanced exception handling try: with zopen(d["path"], "r", encoding=self.encoding) as f: data = f.read() except Exception as e: data = f"Unable to read: {e}" elif d["size"] > contents_size_limit: data = f"File exceeds size limit of {contents_size_limit} bytes" else: data = "Unable to read: Unknown error" d.update({"contents": data}) # remove size and path if not explicitly requested if orig_properties is not None and "size" not in orig_properties: d.pop("size") if orig_properties is not None and "path" not in orig_properties: d.pop("path") yield d def query_one( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, contents_size_limit: Optional[int] = None, ): """ Queries the Store for a single document. Args: criteria: PyMongo filter for documents to search properties: properties to return in the document sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. contents_size_limit: Maximum file size in bytes for which to return contents. The FileStore will attempt to read the file and populate the 'contents' key with its content at query time, unless the file size is larger than this value. """ return next( self.query( criteria=criteria, properties=properties, sort=sort, contents_size_limit=contents_size_limit, ), None, ) def remove_docs(self, criteria: dict, confirm: bool = False): """ Remove items matching the query dictionary. Args: criteria: query dictionary to match confirm: Boolean flag to confirm that remove_docs should delete files on disk. Default: False. """ if self.read_only: raise StoreError( "This Store is read-only. To enable file I/O, re-initialize the store with read_only=False." ) docs = list(self.query(criteria)) # this ensures that any modifications to criteria made by self.query # (e.g., related to orphans or contents) are propagated through to the superclass new_criteria = {"file_id": {"$in": [d["file_id"] for d in docs]}} if len(docs) > 0 and not confirm: raise StoreError( f"Warning! This command is about to delete {len(docs)} items from disk! " "If this is what you want, reissue this command with confirm=True." ) for d in docs: Path(d["path"]).unlink() super().remove_docs(criteria=new_criteria) maggma-0.70.0/src/maggma/stores/gridfs.py000066400000000000000000000434471470132070100202250ustar00rootroot00000000000000""" Module containing various definitions of Stores. Stores are a default access pattern to data and provide various utilities. 
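Example:
    A minimal GridFSStore round trip, shown as an illustrative sketch; the
    database, collection, and field names are placeholders and a running
    MongoDB instance is assumed:

        store = GridFSStore(
            database="my_db",
            collection_name="fs",
            key="task_id",
            compression=True,
        )
        store.connect()
        store.update([{"task_id": "abc", "data": [1, 2, 3]}])
        doc = store.query_one(criteria={"task_id": "abc"})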
""" import copy import json import zlib from collections.abc import Iterator from datetime import datetime from typing import Any, Optional, Union import gridfs from monty.json import jsanitize from pydash import get, has from pymongo import MongoClient, uri_parser from pymongo.errors import ConfigurationError from ruamel.yaml import YAML from maggma.core import Sort, Store, StoreError from maggma.stores.mongolike import MongoStore from maggma.stores.ssh_tunnel import SSHTunnel # https://github.com/mongodb/specifications/ # blob/master/source/gridfs/gridfs-spec.rst#terms # (Under "Files collection document") files_collection_fields = ( "_id", "length", "chunkSize", "uploadDate", "md5", "filename", "contentType", "aliases", "metadata", ) class GridFSStore(Store): """ A Store for GridFS backend. Provides a common access method consistent with other stores. """ def __init__( self, database: str, collection_name: str, host: str = "localhost", port: int = 27017, username: str = "", password: str = "", compression: bool = False, ensure_metadata: bool = False, searchable_fields: Optional[list[str]] = None, auth_source: Optional[str] = None, mongoclient_kwargs: Optional[dict] = None, ssh_tunnel: Optional[SSHTunnel] = None, **kwargs, ): """ Initializes a GridFS Store for binary data Args: database: database name collection_name: The name of the collection. This is the string portion before the GridFS extensions host: hostname for the database port: port to connect to username: username to connect as password: password to authenticate as compression: compress the data as it goes into GridFS ensure_metadata: ensure returned documents have the metadata fields searchable_fields: fields to keep in the index store auth_source: The database to authenticate on. Defaults to the database name. ssh_tunnel: An SSHTunnel object to use. """ self.database = database self.collection_name = collection_name self.host = host self.port = port self.username = username self.password = password self._coll: Any = None self.compression = compression self.ensure_metadata = ensure_metadata self.searchable_fields = [] if searchable_fields is None else searchable_fields self.kwargs = kwargs self.ssh_tunnel = ssh_tunnel if auth_source is None: auth_source = self.database self.auth_source = auth_source self.mongoclient_kwargs = mongoclient_kwargs or {} if "key" not in kwargs: kwargs["key"] = "_id" super().__init__(**kwargs) @classmethod def from_launchpad_file(cls, lp_file, collection_name, **kwargs): """ Convenience method to construct a GridFSStore from a launchpad file. Note: A launchpad file is a special formatted yaml file used in fireworks Returns: """ with open(lp_file) as f: yaml = YAML(typ="safe", pure=True) lp_creds = yaml.load(f.read()) db_creds = lp_creds.copy() db_creds["database"] = db_creds["name"] for key in list(db_creds.keys()): if key not in ["database", "host", "port", "username", "password"]: db_creds.pop(key) db_creds["collection_name"] = collection_name return cls(**db_creds, **kwargs) @property def name(self) -> str: """ Return a string representing this data source. """ return f"gridfs://{self.host}/{self.database}/{self.collection_name}" def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not when the Store is already connected. 
""" if not self._coll or force_reset: if self.ssh_tunnel is None: host = self.host port = self.port else: self.ssh_tunnel.start() host, port = self.ssh_tunnel.local_address conn: MongoClient = ( MongoClient( host=host, port=port, username=self.username, password=self.password, authSource=self.auth_source, **self.mongoclient_kwargs, ) if self.username != "" else MongoClient(host, port, **self.mongoclient_kwargs) ) db = conn[self.database] self._coll = gridfs.GridFS(db, self.collection_name) self._files_collection = db[f"{self.collection_name}.files"] self._files_store = MongoStore.from_collection(self._files_collection) self._files_store.last_updated_field = f"metadata.{self.last_updated_field}" self._files_store.key = self.key self._chunks_collection = db[f"{self.collection_name}.chunks"] @property def _collection(self): """Property referring to underlying pymongo collection.""" if self._coll is None: raise StoreError("Must connect Mongo-like store before attempting to use it") return self._coll @property def last_updated(self) -> datetime: """ Provides the most recent last_updated date time stamp from the documents in this Store. """ return self._files_store.last_updated @classmethod def transform_criteria(cls, criteria: dict) -> dict: """ Allow client to not need to prepend 'metadata.' to query fields. Args: criteria: Query criteria """ new_criteria = dict() for field in criteria: if field not in files_collection_fields and not field.startswith("metadata."): new_criteria["metadata." + field] = copy.copy(criteria[field]) else: new_criteria[field] = copy.copy(criteria[field]) return new_criteria def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ if isinstance(criteria, dict): criteria = self.transform_criteria(criteria) return self._files_store.count(criteria) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the GridFS Store for a set of documents. Will check to see if data can be returned from files store first. If the data from the gridfs is not a json serialized string a dict will be returned with the data in the "data" key plus the self.key and self.last_updated_field. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
skip: number documents to skip limit: limit on total number of documents returned """ if isinstance(criteria, dict): criteria = self.transform_criteria(criteria) elif criteria is not None: raise ValueError("Criteria must be a dictionary or None") prop_keys = set() if isinstance(properties, dict): prop_keys = set(properties.keys()) elif isinstance(properties, list): prop_keys = set(properties) for doc in self._files_store.query(criteria=criteria, sort=sort, limit=limit, skip=skip): if properties is not None and prop_keys.issubset(set(doc.keys())): yield {p: doc[p] for p in properties if p in doc} else: metadata = doc.get("metadata", {}) data = self._collection.find_one( filter={"_id": doc["_id"]}, skip=skip, limit=limit, sort=sort, ).read() if metadata.get("compression", "") == "zlib": data = zlib.decompress(data).decode("UTF-8") try: data = json.loads(data) except Exception: if not isinstance(data, dict): data = { "data": data, self.key: doc.get(self.key), self.last_updated_field: doc.get(self.last_updated_field), } if self.ensure_metadata and isinstance(data, dict): data.update(metadata) yield data def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. This function only operates on the metadata in the files collection. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ criteria = self.transform_criteria(criteria) if isinstance(criteria, dict) else criteria field = ( f"metadata.{field}" if field not in files_collection_fields and not field.startswith("metadata.") else field ) return self._files_store.distinct(field=field, criteria=criteria) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Will only work if the keys are included in the files collection for GridFS. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ criteria = self.transform_criteria(criteria) if isinstance(criteria, dict) else criteria keys = [keys] if not isinstance(keys, list) else keys keys = [ f"metadata.{k}" if k not in files_collection_fields and not k.startswith("metadata.") else k for k in keys ] for group, ids in self._files_store.groupby(keys, criteria=criteria, properties=[f"metadata.{self.key}"]): ids = [get(doc, f"metadata.{self.key}") for doc in ids if has(doc, f"metadata.{self.key}")] group = {k.replace("metadata.", ""): get(group, k) for k in keys if has(group, k)} yield group, list(self.query(criteria={self.key: {"$in": ids}})) def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool: """ Tries to create an index and return true if it succeeded Currently operators on the GridFS files collection Args: key: single key to index unique: Whether or not this index contains only unique keys. 
Returns: bool indicating if the index exists/was created """ # Transform key for gridfs first if key not in files_collection_fields: files_col_key = f"metadata.{key}" return self._files_store.ensure_index(files_col_key, unique=unique) return self._files_store.ensure_index(key, unique=unique) def update( self, docs: Union[list[dict], dict], key: Union[list, str, None] = None, additional_metadata: Union[str, list[str], None] = None, ): """ Update documents into the Store. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used additional_metadata: field(s) to include in the gridfs metadata """ if not isinstance(docs, list): docs = [docs] if isinstance(key, str): key = [key] elif not key: key = [self.key] key = list(set(key) - set(files_collection_fields)) if additional_metadata is None: additional_metadata = [] elif isinstance(additional_metadata, str): additional_metadata = [additional_metadata] else: additional_metadata = list(additional_metadata) for d in docs: search_doc = {k: d[k] for k in key} metadata = { k: get(d, k) for k in [self.last_updated_field, *additional_metadata, *self.searchable_fields] if has(d, k) } metadata.update(search_doc) data = json.dumps(jsanitize(d, recursive_msonable=True)).encode("UTF-8") if self.compression: data = zlib.compress(data) metadata["compression"] = "zlib" self._collection.put(data, metadata=metadata) search_doc = self.transform_criteria(search_doc) # Cleans up old gridfs entries for fdoc in self._files_collection.find(search_doc, ["_id"]).sort("uploadDate", -1).skip(1): self._collection.delete(fdoc["_id"]) def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ if isinstance(criteria, dict): criteria = self.transform_criteria(criteria) ids = [cursor._id for cursor in self._collection.find(criteria)] for _id in ids: self._collection.delete(_id) def close(self): self._files_store.close() self._coll = None if self.ssh_tunnel is not None: self.ssh_tunnel.stop() def __eq__(self, other: object) -> bool: """ Check equality for GridFSStore other: other GridFSStore to compare with. """ if not isinstance(other, GridFSStore): return False fields = ["database", "collection_name", "host", "port"] return all(getattr(self, f) == getattr(other, f) for f in fields) class GridFSURIStore(GridFSStore): """ A Store for GridFS backend, with connection via a mongo URI string. This is expected to be a special mongodb+srv:// URIs that include client parameters via TXT records """ def __init__( self, uri: str, collection_name: str, database: Optional[str] = None, compression: bool = False, ensure_metadata: bool = False, searchable_fields: Optional[list[str]] = None, mongoclient_kwargs: Optional[dict] = None, **kwargs, ): """ Initializes a GridFS Store for binary data. Args: uri: MongoDB+SRV URI database: database to connect to collection_name: The collection name compression: compress the data as it goes into GridFS ensure_metadata: ensure returned documents have the metadata fields searchable_fields: fields to keep in the index store. 
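        Example:
            An illustrative sketch; the URI, collection, and key below are
            placeholders (the database name is parsed from the URI path when
            `database` is not supplied):

                store = GridFSURIStore(
                    uri="mongodb+srv://user:pass@cluster.example.com/my_db",
                    collection_name="fs",
                    key="task_id",
                )
                store.connect()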
""" self.uri = uri # parse the dbname from the uri if database is None: d_uri = uri_parser.parse_uri(uri) if d_uri["database"] is None: raise ConfigurationError("If database name is not supplied, a database must be set in the uri") self.database = d_uri["database"] else: self.database = database self.collection_name = collection_name self._coll: Any = None self.compression = compression self.ensure_metadata = ensure_metadata self.searchable_fields = [] if searchable_fields is None else searchable_fields self.kwargs = kwargs self.mongoclient_kwargs = mongoclient_kwargs or {} if "key" not in kwargs: kwargs["key"] = "_id" super(GridFSStore, self).__init__(**kwargs) # lgtm def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ if not self._coll or force_reset: # pragma: no cover conn: MongoClient = MongoClient(self.uri, **self.mongoclient_kwargs) db = conn[self.database] self._coll = gridfs.GridFS(db, self.collection_name) self._files_collection = db[f"{self.collection_name}.files"] self._files_store = MongoStore.from_collection(self._files_collection) self._files_store.last_updated_field = f"metadata.{self.last_updated_field}" self._files_store.key = self.key self._chunks_collection = db[f"{self.collection_name}.chunks"] maggma-0.70.0/src/maggma/stores/mongolike.py000066400000000000000000001036301470132070100207220ustar00rootroot00000000000000""" Module containing various definitions of Stores. Stores are a default access pattern to data and provide various utilities. """ import warnings from collections.abc import Iterator from itertools import chain, groupby from pathlib import Path from typing import Any, Callable, Literal, Optional, Union import bson import mongomock import orjson from monty.dev import requires from monty.io import zopen from monty.json import jsanitize from monty.serialization import loadfn from pydash import get, has, set_ from pymongo import MongoClient, ReplaceOne, uri_parser from pymongo.errors import ConfigurationError, DocumentTooLarge, OperationFailure from ruamel.yaml import YAML from maggma.core import Sort, Store, StoreError from maggma.stores.ssh_tunnel import SSHTunnel from maggma.utils import confirm_field_index, to_dt try: from montydb import MontyClient, set_storage # type: ignore except ImportError: MontyClient = None class MongoStore(Store): """ A Store that connects to a Mongo collection. """ def __init__( self, database: str, collection_name: str, host: str = "localhost", port: int = 27017, username: str = "", password: str = "", ssh_tunnel: Optional[SSHTunnel] = None, safe_update: bool = False, auth_source: Optional[str] = None, mongoclient_kwargs: Optional[dict] = None, default_sort: Optional[dict[str, Union[Sort, int]]] = None, **kwargs, ): """ Args: database: The database name collection_name: The collection name host: Hostname for the database port: TCP port to connect to username: Username for the collection password: Password to connect with safe_update: fail gracefully on DocumentTooLarge errors on update auth_source: The database to authenticate on. Defaults to the database name. default_sort: Default sort field and direction to use when querying. Can be used to ensure determinacy in query results. 
""" self.database = database self.collection_name = collection_name self.host = host self.port = port self.username = username self.password = password self.ssh_tunnel = ssh_tunnel self.safe_update = safe_update self.default_sort = default_sort self._coll = None # type: ignore self.kwargs = kwargs if auth_source is None: auth_source = self.database self.auth_source = auth_source self.mongoclient_kwargs = mongoclient_kwargs or {} super().__init__(**kwargs) @property def name(self) -> str: """ Return a string representing this data source. """ return f"mongo://{self.host}/{self.database}/{self.collection_name}" def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ if self._coll is None or force_reset: if self.ssh_tunnel is None: host = self.host port = self.port else: self.ssh_tunnel.start() host, port = self.ssh_tunnel.local_address conn: MongoClient = ( MongoClient( host=host, port=port, username=self.username, password=self.password, authSource=self.auth_source, **self.mongoclient_kwargs, ) if self.username != "" else MongoClient(host, port, **self.mongoclient_kwargs) ) db = conn[self.database] self._coll = db[self.collection_name] # type: ignore def __hash__(self) -> int: """Hash for MongoStore.""" return hash((self.database, self.collection_name, self.last_updated_field)) @classmethod def from_db_file(cls, filename: str, **kwargs): """ Convenience method to construct MongoStore from db_file from old QueryEngine format. """ kwargs = loadfn(filename) if "collection" in kwargs: kwargs["collection_name"] = kwargs.pop("collection") # Get rid of aliases from traditional query engine db docs kwargs.pop("aliases", None) return cls(**kwargs) @classmethod def from_launchpad_file(cls, lp_file, collection_name, **kwargs): """ Convenience method to construct MongoStore from a launchpad file. Note: A launchpad file is a special formatted yaml file used in fireworks Returns: """ with open(lp_file) as f: yaml = YAML(typ="safe", pure=True) lp_creds = yaml.load(f.read()) db_creds = lp_creds.copy() db_creds["database"] = db_creds["name"] for key in list(db_creds.keys()): if key not in ["database", "host", "port", "username", "password"]: db_creds.pop(key) db_creds["collection_name"] = collection_name return cls(**db_creds, **kwargs) def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ criteria = criteria or {} try: distinct_vals = self._collection.distinct(field, criteria) except (OperationFailure, DocumentTooLarge): distinct_vals = [ d["_id"] for d in self._collection.aggregate([{"$match": criteria}, {"$group": {"_id": f"${field}"}}]) ] if all(isinstance(d, list) for d in filter(None, distinct_vals)): # type: ignore distinct_vals = list(chain.from_iterable(filter(None, distinct_vals))) return distinct_vals if distinct_vals is not None else [] def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. 
Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (key, list of docs) """ pipeline = [] if isinstance(keys, str): keys = [keys] if properties is None: properties = [] if isinstance(properties, dict): properties = list(properties.keys()) if criteria is not None: pipeline.append({"$match": criteria}) if len(properties) > 0: pipeline.append({"$project": {p: 1 for p in properties + keys}}) alpha = "abcdefghijklmnopqrstuvwxyz" group_id = {letter: f"${key}" for letter, key in zip(alpha, keys)} pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}}) for d in self._collection.aggregate(pipeline, allowDiskUse=True): id_doc = {} # type: ignore for letter, key in group_id.items(): if has(d["_id"], letter): set_(id_doc, key[1:], d["_id"][letter]) yield (id_doc, d["docs"]) @classmethod def from_collection(cls, collection): """ Generates a MongoStore from a pymongo collection object This is not a fully safe operation as it gives dummy information to the MongoStore As a result, this will not serialize and can not reset its connection. Args: collection: the PyMongo collection to create a MongoStore around """ # TODO: How do we make this safer? coll_name = collection.name db_name = collection.database.name store = cls(db_name, coll_name) store._coll = collection return store @property def _collection(self): """Property referring to underlying pymongo collection.""" if self._coll is None: raise StoreError("Must connect Mongo-like store before attempting to use it") return self._coll def count( self, criteria: Optional[dict] = None, hint: Optional[dict[str, Union[Sort, int]]] = None, ) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in hint: Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending. """ criteria = criteria if criteria else {} hint_list = ( [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None ) if hint_list is not None: # pragma: no cover return self._collection.count_documents(filter=criteria, hint=hint_list) return ( self._collection.count_documents(filter=criteria) if criteria else self._collection.estimated_document_count() ) def query( # type: ignore self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, hint: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, **kwargs, ) -> Iterator[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. hint: Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned mongoclient_kwargs: Dict of extra kwargs to pass to pymongo find. 
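        Example:
            An illustrative sketch, assuming `store` is a connected MongoStore;
            the field names used in the filter are placeholders:

                for doc in store.query(
                    criteria={"nelements": {"$gte": 2}},
                    properties=["task_id", "nelements"],
                    sort={"task_id": 1},
                    limit=10,
                ):
                    print(doc["task_id"])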
""" if isinstance(properties, list): properties = {p: 1 for p in properties} default_sort_formatted = None if self.default_sort is not None: default_sort_formatted = [ (k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in self.default_sort.items() ] sort_list = ( [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in sort.items()] if sort else default_sort_formatted ) hint_list = ( [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None ) yield from self._collection.find( filter=criteria, projection=properties, skip=skip, limit=limit, sort=sort_list, hint=hint_list, **kwargs, ) def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool: """ Tries to create an index and return true if it succeeded. Args: key: single key to index unique: Whether or not this index contains only unique keys. Returns: bool indicating if the index exists/was created """ if confirm_field_index(self._collection, key): return True try: self._collection.create_index(key, unique=unique, background=True) return True except Exception: return False def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ requests = [] if not isinstance(docs, list): docs = [docs] for d in (jsanitize(x, allow_bson=True, recursive_msonable=True) for x in docs): # document-level validation is optional validates = True if self.validator: validates = self.validator.is_valid(d) if not validates: if self.validator.strict: raise ValueError(self.validator.validation_errors(d)) self.logger.error(self.validator.validation_errors(d)) if validates: key = key or self.key search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]} requests.append(ReplaceOne(search_doc, d, upsert=True)) if len(requests) > 0: try: self._collection.bulk_write(requests, ordered=False) except (OperationFailure, DocumentTooLarge) as e: if self.safe_update: for req in requests: try: self._collection.bulk_write([req], ordered=False) except (OperationFailure, DocumentTooLarge): self.logger.error( f"Could not upload document for {req._filter} as it was too large for Mongo" ) else: raise e def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ self._collection.delete_many(filter=criteria) def close(self): """Close up all collections.""" self._collection.database.client.close() self._coll = None if self.ssh_tunnel is not None: self.ssh_tunnel.stop() def __eq__(self, other: object) -> bool: """ Check equality for MongoStore other: other mongostore to compare with. """ if not isinstance(other, MongoStore): return False fields = ["database", "collection_name", "host", "port", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) class MongoURIStore(MongoStore): """ A Store that connects to a Mongo collection via a URI This is expected to be a special mongodb+srv:// URIs that include client parameters via TXT records. 
""" def __init__( self, uri: str, collection_name: str, database: Optional[str] = None, ssh_tunnel: Optional[SSHTunnel] = None, mongoclient_kwargs: Optional[dict] = None, default_sort: Optional[dict[str, Union[Sort, int]]] = None, **kwargs, ): """ Args: uri: MongoDB+SRV URI database: database to connect to collection_name: The collection name default_sort: Default sort field and direction to use when querying. Can be used to ensure determinacy in query results. """ self.uri = uri self.ssh_tunnel = ssh_tunnel self.default_sort = default_sort self.mongoclient_kwargs = mongoclient_kwargs or {} # parse the dbname from the uri if database is None: d_uri = uri_parser.parse_uri(uri) if d_uri["database"] is None: raise ConfigurationError("If database name is not supplied, a database must be set in the uri") self.database = d_uri["database"] else: self.database = database self.collection_name = collection_name self.kwargs = kwargs self._coll = None super(MongoStore, self).__init__(**kwargs) # lgtm @property def name(self) -> str: """ Return a string representing this data source. """ # TODO: This is not very safe since it exposes the username/password info return self.uri def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ if self._coll is None or force_reset: # pragma: no cover conn: MongoClient = MongoClient(self.uri, **self.mongoclient_kwargs) db = conn[self.database] self._coll = db[self.collection_name] # type: ignore class MemoryStore(MongoStore): """ An in-memory Store that functions similarly to a MongoStore. """ def __init__(self, collection_name: str = "memory_db", **kwargs): """ Initializes the Memory Store. Args: collection_name: name for the collection in memory. """ self.collection_name = collection_name self.default_sort = None self._coll = None self.kwargs = kwargs super(MongoStore, self).__init__(**kwargs) def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ if self._coll is None or force_reset: self._coll = mongomock.MongoClient().db[self.name] # type: ignore def close(self): """Close up all collections.""" self._coll.database.client.close() @property def name(self): """Name for the store.""" return f"mem://{self.collection_name}" def __hash__(self): """Hash for the store.""" return hash((self.name, self.last_updated_field)) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (key, list of elements) """ keys = keys if isinstance(keys, list) else [keys] if properties is None: properties = [] if isinstance(properties, dict): properties = list(properties.keys()) data = [ doc for doc in self.query(properties=keys + properties, criteria=criteria) if all(has(doc, k) for k in keys) ] def grouping_keys(doc): return tuple(get(doc, k) for k in keys) for vals, group in groupby(sorted(data, key=grouping_keys), key=grouping_keys): doc = {} # type: ignore for k, v in zip(keys, vals): set_(doc, k, v) yield doc, list(group) def __eq__(self, other: object) -> bool: """ Check equality for MemoryStore other: other MemoryStore to compare with. """ if not isinstance(other, MemoryStore): return False fields = ["collection_name", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) class JSONStore(MemoryStore): """ A Store for access to a single or multiple JSON files. """ def __init__( self, paths: Union[str, list[str]], read_only: bool = True, serialization_option: Optional[int] = None, serialization_default: Optional[Callable[[Any], Any]] = None, encoding: Optional[str] = None, **kwargs, ): """ Args: paths: paths for json files to turn into a Store read_only: whether this JSONStore is read only. When read_only=True, the JSONStore can still apply MongoDB-like writable operations (e.g. an update) because it behaves like a MemoryStore, but it will not write those changes to the file. On the other hand, if read_only=False (i.e., it is writeable), the JSON file will be automatically updated every time a write-like operation is performed. Note that when read_only=False, JSONStore only supports a single JSON file. If the file does not exist, it will be automatically created when the JSONStore is initialized. serialization_option: option that will be passed to the orjson.dump when saving to the json the file. serialization_default: default that will be passed to the orjson.dump when saving to the json the file. encoding: Character encoding of files to be tracked by the store. The default (None) follows python's default behavior, which is to determine the character encoding from the platform. This should work in the great majority of cases. However, if you encounter a UnicodeDecodeError, consider setting the encoding explicitly to 'utf8' or another encoding as appropriate. """ paths = paths if isinstance(paths, (list, tuple)) else [paths] self.paths = paths self.encoding = encoding # file_writable overrides read_only for compatibility reasons if "file_writable" in kwargs: file_writable = kwargs.pop("file_writable") warnings.warn( "file_writable is deprecated; use read only instead.", DeprecationWarning, ) self.read_only = not file_writable if self.read_only != read_only: warnings.warn( f"Received conflicting keyword arguments file_writable={file_writable}" f" and read_only={read_only}. Setting read_only={file_writable}.", UserWarning, ) else: self.read_only = read_only self.kwargs = kwargs if not self.read_only and len(paths) > 1: raise RuntimeError("Cannot instantiate file-writable JSONStore with multiple JSON files.") self.default_sort = None self.serialization_option = serialization_option self.serialization_default = serialization_default super().__init__(**kwargs) def connect(self, force_reset: bool = False): """ Loads the files into the collection in memory. Args: force_reset: whether to reset the connection or not. 
If False (default) and .connect() has been called previously, the .json file will not be read in again. This can improve performance on systems with slow storage when multiple connect / disconnects are performed. """ if self._coll is None or force_reset: self._coll = mongomock.MongoClient().db[self.name] # type: ignore # create the .json file if it does not exist if not self.read_only and not Path(self.paths[0]).exists(): with zopen(self.paths[0], "w", encoding=self.encoding) as f: data: list[dict] = [] bytesdata = orjson.dumps(data) f.write(bytesdata.decode("utf-8")) for path in self.paths: objects = self.read_json_file(path) try: self.update(objects) except KeyError: raise KeyError( f""" Key field '{self.key}' not found in {path.name}. This could mean that this JSONStore was initially created with a different key field. The keys found in the .json file are {list(objects[0].keys())}. Try re-initializing your JSONStore using one of these as the key arguments. """ ) def read_json_file(self, path) -> list: """ Helper method to read the contents of a JSON file and generate a list of docs. Args: path: Path to the JSON file to be read """ with zopen(path) as f: data = f.read() data = data.decode() if isinstance(data, bytes) else data objects = bson.json_util.loads(data) if "$oid" in data else orjson.loads(data) objects = [objects] if not isinstance(objects, list) else objects # datetime objects deserialize to str. Try to convert the last_updated # field back to datetime. # # TODO - there may still be problems caused if a JSONStore is init'ed from # documents that don't contain a last_updated field # See Store.last_updated in store.py. for obj in objects: if obj.get(self.last_updated_field): obj[self.last_updated_field] = to_dt(obj[self.last_updated_field]) return objects def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store. For a file-writable JSONStore, the json file is updated. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ super().update(docs=docs, key=key) if not self.read_only: self.update_json_file() def remove_docs(self, criteria: dict): """ Remove docs matching the query dictionary. For a file-writable JSONStore, the json file is updated. Args: criteria: query dictionary to match """ super().remove_docs(criteria=criteria) if not self.read_only: self.update_json_file() def update_json_file(self): """ Updates the json file when a write-like operation is performed. """ with zopen(self.paths[0], "w", encoding=self.encoding) as f: data = list(self.query()) for d in data: d.pop("_id") bytesdata = orjson.dumps( data, option=self.serialization_option, default=self.serialization_default, ) f.write(bytesdata.decode("utf-8")) def __hash__(self): return hash((*self.paths, self.last_updated_field)) def __eq__(self, other: object) -> bool: """ Check equality for JSONStore. Args: other: other JSONStore to compare with """ if not isinstance(other, JSONStore): return False fields = ["paths", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) @requires( MontyClient is not None, "MontyStore requires MontyDB to be installed. See the MontyDB repository for more " "information: https://github.com/davidlatwe/montydb", ) class MontyStore(MemoryStore): """ A MongoDB compatible store that uses on disk files for storage. 
This is handled under the hood using MontyDB. A number of on-disk storage options are available but MontyDB provides a mongo style interface for all options. The options include: - sqlite: Uses an sqlite database to store documents. - lightning: Uses Lightning Memory-Mapped Database (LMDB) for storage. This can provide fast read and write times but requires lmdb to be installed (in most cases this can be achieved using ``pip install lmdb``). - flatfile: Uses a system of flat json files. This is not recommended as multiple simultaneous connections to the store will not work correctly. Note that MontyDB (and, therefore, MontyStore) will write out a new database to the disk but cannot be used to read an existing (e.g. SQLite) database that wasn't formatted by MontyDB. See the MontyDB repository for more information: https://github.com/davidlatwe/montydb """ def __init__( self, collection_name, database_path: Optional[str] = None, database_name: str = "db", storage: Literal["sqlite", "flatfile", "lightning"] = "sqlite", storage_kwargs: Optional[dict] = None, client_kwargs: Optional[dict] = None, **kwargs, ): """ Initializes the Monty Store. Args: collection_name: Name for the collection. database_path: Path to on-disk database files. If None, the current working directory will be used. database_name: The database name. storage: The storage type. Options include "sqlite", "lightning", "flatfile". Note that although MontyDB supports in memory storage, this capability is disabled in maggma to avoid unintended behavior, since multiple in-memory MontyStore would actually point to the same data. storage_kwargs: Keyword arguments passed to ``montydb.set_storage``. client_kwargs: Keyword arguments passed to the ``montydb.MontyClient`` constructor. **kwargs: Additional keyword arguments passed to the Store constructor. """ if database_path is None: database_path = str(Path.cwd()) self.database_path = database_path self.database_name = database_name self.collection_name = collection_name self._coll = None # type: ignore self.default_sort = None self.ssh_tunnel = None # This is to fix issues with the tunnel on close self.kwargs = kwargs self.storage = storage self.storage_kwargs = storage_kwargs or { "use_bson": True, # import pymongo's BSON; do not use montydb's "mongo_version": "4.0", } self.client_kwargs = client_kwargs or {} super(MongoStore, self).__init__(**kwargs) def connect(self, force_reset: bool = False): """ Connect to the database store. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ if not self._coll or force_reset: # TODO - workaround, may be obviated by a future montydb update if self.database_path != ":memory:": set_storage(self.database_path, storage=self.storage, **self.storage_kwargs) client = MontyClient(self.database_path, **self.client_kwargs) self._coll = client[self.database_name][self.collection_name] @property def name(self) -> str: """Return a string representing this data source.""" return f"monty://{self.database_path}/{self.database_name}/{self.collection_name}" def count( self, criteria: Optional[dict] = None, hint: Optional[dict[str, Union[Sort, int]]] = None, ) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in hint: Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending. 
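        Example:
            A short sketch; the collection name, database path, and field name are
            illustrative assumptions.

            ```
            store = MontyStore("tasks", database_path="/tmp/monty_db")
            store.connect()
            n_binaries = store.count(criteria={"nelements": 2})
            ```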
""" criteria = criteria if criteria else {} hint_list = ( [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None ) if hint_list is not None: # pragma: no cover return self._collection.count_documents(filter=criteria, hint=hint_list) return self._collection.count_documents(filter=criteria) def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None): """ Update documents into the Store. Args: docs: The document or list of documents to update. key: Field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used. """ if not isinstance(docs, list): docs = [docs] for d in docs: d = jsanitize(d, allow_bson=True) # document-level validation is optional validates = True if self.validator: validates = self.validator.is_valid(d) if not validates: if self.validator.strict: raise ValueError(self.validator.validation_errors(d)) self.logger.error(self.validator.validation_errors(d)) if validates: key = key or self.key search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]} self._collection.replace_one(search_doc, d, upsert=True) maggma-0.70.0/src/maggma/stores/open_data.py000066400000000000000000000756731470132070100207070ustar00rootroot00000000000000import gzip import logging import re from collections.abc import Generator from datetime import datetime from io import BytesIO, StringIO from typing import Optional, Union import jsonlines import numpy as np import pandas as pd from boto3 import client as boto_client from botocore import UNSIGNED from botocore.config import Config from botocore.exceptions import ClientError from bson import json_util from maggma.core.store import Sort from maggma.utils import LU_KEY_ISOFORMAT def chunker(df: pd.DataFrame, chunk_size: int) -> Generator[pd.DataFrame, None, None]: """ Creates a generator for a DataFrame to allow chunk processing. Args: df: the DataFrame to chunk chunk_size: size of the chunks Returns: Generator[pd.DataFrame, None, None]: a generator for a DataFrame to allow chunk processing. """ return (df.iloc[pos : pos + chunk_size] for pos in range(0, len(df), chunk_size)) class PandasMemoryStore: """ A store that is backed by Pandas DataFrame. """ def __init__( self, key: str = "task_id", last_updated_field: str = "last_updated", ): """ Args: key: main key to index on last_updated_field: field for date/time stamping the data. """ self._data = None self.key = key self.last_updated_field = last_updated_field self.logger = logging.getLogger(type(self).__name__) self.logger.addHandler(logging.NullHandler()) self.logger.warning( "Use all open data stores with caution as they are deprecated and may be incompatible with numpy 2.0+." 
) @property def index_data(self): return self._data def set_index_data(self, new_index: pd.DataFrame): self._data = new_index def _verify_criteria(self, criteria: dict) -> tuple[str, str, list]: query_string, is_in_key, is_in_list = "", None, None if criteria and "query" not in criteria and "is_in" not in criteria: raise AttributeError("Pandas memory store only support query or is_in") if criteria and "query" in criteria and "is_in" in criteria: raise AttributeError("Pandas memory store cannot mix query and is_in; please just use one or the other") if criteria: if "is_in" in criteria: is_in_key, is_in_list = criteria["is_in"] query_string = None elif "query" in criteria: query_string = criteria["query"] return query_string, is_in_key, is_in_list def query( self, criteria: Optional[dict] = None, properties: Union[list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, criteria_fields: Union[list, None] = None, ) -> pd.DataFrame: """ Queries the Store for a set of documents. Args: criteria: if there's a `query` key, it's value will be used as the Pandas string expression to query with; if there's a 'is_in' key, it's value will be used to perform an isin call using the first item in that tuple for the column name and the second item as the list across which to filter on; only one valid key is accepted; all other data in the criteria will be ignored properties: subset of properties to return sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip (from the start of the result set) limit: limit on total number of documents returned criteria_fields: if this value is not None, the in-memory index will be used for the query if all the "criteria_fields" and "properties" are present in the in-memory index; otherwise will default to querying store type dependent implementation Returns: pd.DataFrame: DataFrame that contains all the documents that match the query parameters Raises: AttributeError: if criteria exists and does not include a valid key; also if more than one valid key is present """ query_string, is_in_key, is_in_list = self._verify_criteria(criteria=criteria) if properties is not None and not isinstance(properties, list): raise AttributeError(f"Pandas query expects properties must be a list and not a {type(properties)}") if self._data is None: return pd.DataFrame() return PandasMemoryStore._query( index=self._data, query_string=query_string, is_in_key=is_in_key, is_in_list=is_in_list, properties=properties, sort=sort, skip=skip, limit=limit, ) @staticmethod def _query( index: pd.DataFrame, query_string: str, is_in_key: str, is_in_list: list, properties: Union[list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> pd.DataFrame: ret = index if query_string: ret = ret.query(query_string) elif is_in_key is not None: ret = ret[ret[is_in_key].isin(is_in_list)] if sort: sort_keys, sort_ascending = zip(*[(k, v == 1) for k, v in sort.items()]) ret = ret.sort_values(by=list(sort_keys), ascending=list(sort_ascending)) if properties: ret = ret[properties] ret = ret[skip:] if limit > 0: ret = ret[:limit] return ret def count(self, criteria: Optional[dict] = None, criteria_fields: Union[list, None] = None) -> int: """ Counts the number of documents matching the query criteria. 
Returns: int: the number of documents matching the query criteria Args: criteria: see `query` method for details on how to construct criteria_fields: see `query` method for details """ return len(self.query(criteria=criteria, criteria_fields=criteria_fields)) def distinct( self, field: str, criteria: Optional[dict] = None, criteria_fields: Union[list, None] = None ) -> pd.Series: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: see `query` method for details on how to construct criteria_fields: see `query` method for details Returns: pd.Series: Series of all the distinct values for the provided field (after filtering by the provided criteria) """ ret = self.query(criteria=criteria, properties=[field], criteria_fields=criteria_fields) return ret[field].drop_duplicates() @property def last_updated(self) -> datetime: """ Provides the most recent last_updated date time stamp from the documents in this Store. """ if self._data is None: return datetime.min max = self._data[self.last_updated_field].max() if max is None: return datetime.min return LU_KEY_ISOFORMAT[0](max) def newer_in( self, target: "PandasMemoryStore", criteria: Optional[dict] = None, exhaustive: bool = False, criteria_fields: Union[list, None] = None, ) -> pd.Series: """ Returns the keys of documents that are newer in the target Store than this Store. Args: target: target Store to compare with criteria: see `query` method for details on how to construct exhaustive: triggers an item-by-item check vs. checking the last_updated of the target Store and using that to filter out new items in criteria_fields: see `query` method for details Returns: pd.Series: if no criteria is provided a Series of the keys of documents in the target store whose last updated field value is greater than the 'newest' document in this store; otherwise a list of the keys of documents in the target store that additionally meet the criteria Raises: AttributeError: if the key and last updated fields are not both present in this store or if criteria is provided when exhaustive is not set to True """ if self._data is None: return target.query() if not (self._field_exists(self.key) and self._field_exists(self.last_updated_field)): raise AttributeError("This index store does not contain data with both key and last updated fields") if criteria is not None and not exhaustive: raise AttributeError("Criteria is only considered when doing an item-by-item check") if exhaustive: # Get our current last_updated dates for each key value props = [self.key, self.last_updated_field] dates = { d[self.key]: LU_KEY_ISOFORMAT[0](d.get(self.last_updated_field, datetime.max)) for _, d in self.query(properties=props, criteria_fields=criteria_fields).iterrows() } # Get the last_updated for the store we're comparing with props = [target.key, target.last_updated_field] target_dates = { d[target.key]: LU_KEY_ISOFORMAT[0](d.get(target.last_updated_field, datetime.min)) for _, d in target.query( criteria=criteria, properties=props, criteria_fields=criteria_fields ).iterrows() } new_keys = set(target_dates.keys()) - set(dates.keys()) updated_keys = {key for key, date in dates.items() if target_dates.get(key, datetime.min) > date} return pd.Series(data=list(new_keys | updated_keys), name=self.key) criteria = {"query": f"{self.last_updated_field} > '{LU_KEY_ISOFORMAT[1](self.last_updated)}'"} return target.distinct(field=self.key, criteria=criteria, criteria_fields=[self.last_updated_field]) def get_merged_items(self, to_dt: pd.DataFrame, 
from_dt: pd.DataFrame) -> pd.DataFrame: orig_columns = to_dt.columns merged = to_dt.merge(from_dt, on=self.key, how="left", suffixes=("", "_B")) for column in from_dt.columns: if column not in self.key: oc_dtype = merged[column].dtype s = merged.pop(column + "_B") s.name = column merged.update(s) merged[column].astype(oc_dtype) return pd.concat( (merged[orig_columns], from_dt[~from_dt.set_index(self.key).index.isin(to_dt.set_index(self.key).index)]), ignore_index=True, ) def update(self, docs: pd.DataFrame) -> pd.DataFrame: """ Update documents into the Store. Args: docs: the document or list of documents to update Returns: pd.DataFrame the updated documents """ if self._data is None: if docs is not None and not docs.empty: self._data = docs return docs self._data = self.get_merged_items(to_dt=self._data, from_dt=docs) return docs def _field_exists(self, key: str) -> bool: return key in self._data def __hash__(self): """Hash for the store.""" return hash((self.key, self.last_updated_field)) def __eq__(self, other: object) -> bool: """ Check equality for PandasMemoryStore other: other PandasMemoryStore to compare with. """ if not isinstance(other, PandasMemoryStore): return False fields = ["key", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) class S3IndexStore(PandasMemoryStore): """ A store that loads the index of the collection from an S3 file. Note that `update` calls will not write changes to S3, only to memory. You must call `store_manifest` to store any updates applied during the session. """ def __init__( self, collection_name: str, bucket: str, prefix: str = "", endpoint_url: Optional[str] = None, manifest_key: str = "manifest.jsonl", **kwargs, ): """Initializes an S3IndexStore. Args: collection_name (str): name of the collection bucket (str): Name of the bucket where the index is stored. prefix (str, optional): The prefix to add to the name of the index, i.e. the manifest key. Defaults to "". endpoint_url (Optional[str], optional): S3-compatible endpoint URL. Defaults to None, indicating to use the default configured AWS S3. manifest_key (str, optional): The name of the index. Defaults to "manifest.jsonl". """ self.collection_name = collection_name self.bucket = bucket self.prefix = prefix if prefix == "" else prefix.rstrip("/") + "/" self.endpoint_url = endpoint_url self.manifest_key = manifest_key self.kwargs = kwargs self._s3_client = None super().__init__(**kwargs) @property def s3_client(self): if self._s3_client is None: self._s3_client = boto_client("s3", endpoint_url=self.endpoint_url) return self._s3_client def connect(self): """ Sets up the S3 client and loads the contents of the index stored in S3 into memory. This will overwrite the local memory with the S3 data. """ try: self.s3_client.head_bucket(Bucket=self.bucket) except ClientError: raise RuntimeError(f"Bucket not present on AWS: {self.bucket}") # load index self.set_index_data(self.retrieve_manifest()) def close(self): """Closes any connections.""" if self._s3_client is not None: self._s3_client.close() self._s3_client = None def retrieve_manifest(self) -> pd.DataFrame: """Retrieves the contents of the index stored in S3. Returns: pd.DataFrame: The index contents read from the manifest file. Returns None if a manifest file does not exist. 
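        Example:
            A usage sketch; the collection, bucket, and prefix names are assumptions.

            ```
            index = S3IndexStore("tasks", bucket="my-bucket", prefix="indexes/tasks")
            index.connect()                 # loads the manifest into memory
            df = index.retrieve_manifest()  # re-reads the manifest from S3 on demand
            ```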
""" try: response = self.s3_client.get_object(Bucket=self.bucket, Key=self._get_manifest_full_key_path()) df = pd.read_json(response["Body"], orient="records", lines=True) return df.map( lambda x: datetime.fromisoformat(x["$date"].rstrip("Z")) if isinstance(x, dict) and "$date" in x else x ) except ClientError as ex: if ex.response["Error"]["Code"] == "NoSuchKey": return None raise def _get_manifest_full_key_path(self) -> str: """Produces the full path for the index.""" return f"{self.prefix}{self.manifest_key}" def store_manifest(self) -> None: """Stores the existing data into the index stored in S3. This overwrites and fully replaces all of the contents of the previous index stored in S3 with the current contents of the memory index. """ string_io = StringIO() with jsonlines.Writer(string_io, dumps=json_util.dumps) as writer: for _, row in self._data.iterrows(): writer.write(row.to_dict()) self.s3_client.put_object( Bucket=self.bucket, Body=BytesIO(string_io.getvalue().encode("utf-8")), Key=self._get_manifest_full_key_path(), ) def __getstate__(self): # Return the object's state excluding the _s3_client attribute state = self.__dict__.copy() state["_s3_client"] = None # Exclude the client from serialization return state def __setstate__(self, state): # Restore instance attributes (excluding the client) self.__dict__.update(state) # Initialize the client as None; it will be recreated on demand self._s3_client = None def __hash__(self): return hash((self.collection_name, self.bucket, self.prefix, self.endpoint_url, self.manifest_key)) def __eq__(self, other: object) -> bool: """ Check equality for S3Store other: other S3Store to compare with. """ if not isinstance(other, S3IndexStore): return False fields = ["collection_name", "bucket", "prefix", "endpoint_url", "manifest_key", "last_updated_field"] return all(getattr(self, f) == getattr(other, f) for f in fields) class OpenDataStore(S3IndexStore): """ Data is stored on S3 compatible storage using the format used by Materials Project on OpenData. The index is loaded from S3 compatible storage into memory. Note that updates will only affect the in-memory representation of the index - they will not be persisted. To persist index writes utilize the `store_manifest` function. This Store should not be used for applications that are distributed and rely on reading updated values from the index as data inconsistencied will arise. """ def __init__( self, index: S3IndexStore = None, # set _index to this and create property searchable_fields: Optional[list[str]] = None, object_file_extension: str = ".jsonl.gz", access_as_public_bucket: bool = False, object_grouping: Optional[list[str]] = None, **kwargs, ): """Initializes an OpenDataStore. Args: index (S3IndexStore): The store that'll be used as the index, ie for queries pertaining to this store. If None, will create index from manifest located in same location as the data. searchable_fields: additional fields to keep in the index store. `key`, `last_updated_field` and the fields in `object_grouping` are already added to the index by default object_file_extension (str, optional): The extension used for the data stored in S3. Defaults to ".jsonl.gz". access_as_public_bucket (bool, optional): If True, the S3 bucket will be accessed without signing, ie as if it's a public bucket. This is useful for end users. Defaults to False. 
""" self._index = index self.searchable_fields = searchable_fields if searchable_fields is not None else [] self.object_file_extension = object_file_extension self.access_as_public_bucket = access_as_public_bucket self.object_grouping = object_grouping if object_grouping is not None else ["nelements", "symmetry_number"] if access_as_public_bucket: kwargs["s3_resource_kwargs"] = kwargs.get("s3_resource_kwargs", {}) kwargs["s3_resource_kwargs"]["config"] = Config(signature_version=UNSIGNED) super().__init__(**kwargs) self.searchable_fields = list( set(self.object_grouping) | set(self.searchable_fields) | {self.key, self.last_updated_field} ) @property def index(self): if self._index is None: return super() return self._index def update( self, docs: pd.DataFrame, ) -> pd.DataFrame: """ Update documents in S3 and local in-memory index. Args: docs: the documents to update Returns: pd.DataFrame the index for the updated documents """ # group docs to update by object grouping docs_by_group = self._json_normalize_and_filter(docs=docs).groupby(self.object_grouping) existing_index = self.index.index_data ret = [] for group, group_docs_index in docs_by_group: query_str = " and ".join([f"{col} == {val!r}" for col, val in zip(self.object_grouping, group)]) group_docs = docs[docs[self.key].isin(group_docs_index[self.key].to_list())] merged_docs, merged_index = group_docs, group_docs_index if existing_index is not None: # fetch subsection of existing and docs_df and do outer merge with indicator=True sub_existing = existing_index.query(query_str) merged_index = self.get_merged_items(to_dt=sub_existing, from_dt=group_docs_index) # if there's any rows in existing only need to fetch the S3 data and merge that in if (~sub_existing[self.key].isin(group_docs_index[self.key])).any(): ## fetch the S3 data and populate those rows in sub_docs_df s3_docs = self._read_doc_from_s3(self._get_full_key_path(sub_existing)) merged_docs = self.get_merged_items(to_dt=s3_docs, from_dt=group_docs) # write doc based on subsection self._write_doc_and_update_index(merged_docs, merged_index) ret.append(merged_index) return pd.concat(ret) def query( self, criteria: Optional[dict] = None, properties: Union[list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, criteria_fields: Union[list, None] = None, ) -> pd.DataFrame: """ Queries the Store for a set of documents. Args: criteria: if there's a `query` key, it's value will be used as the Pandas string expression to query with; if there's a 'is_in' key, it's value will be used to perform an isin call using the first item in that tuple for the column name and the second item as the list across which to filter on; only one valid key is accepted; all other data in the criteria will be ignored properties: subset of properties to return sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. 
skip: number documents to skip (from the start of the result set) limit: limit on total number of documents returned criteria_fields: if this value is not None, the in-memory index will be used for the query if all the "criteria_fields" and "properties" are present in the in-memory index; otherwise will default to querying against the S3 docs Returns: pd.DataFrame: DataFrame that contains all the documents that match the query parameters Raises: AttributeError: if criteria exists and does not include a valid key; also if more than one valid key is present """ query_string, is_in_key, is_in_list = self._verify_criteria(criteria=criteria) if properties is not None and not isinstance(properties, list): raise AttributeError(f"OpenData query expects properties must be a list and not a {type(properties)}") if self.index.index_data is None: return pd.DataFrame() # optimization if all required fields are in the index if criteria_fields is not None and properties is not None: query_fields = set(criteria_fields) | set(properties) if all(item in self.index.index_data.columns for item in list(query_fields)): return self.index.query(criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit) results = [] for _, docs in self.index.index_data.groupby(self.object_grouping): results.append( PandasMemoryStore._query( index=self._read_doc_from_s3(self._get_full_key_path(docs)), query_string=query_string, is_in_key=is_in_key, is_in_list=is_in_list, properties=properties, sort=sort, skip=skip, limit=limit, ) ) return pd.concat(results, ignore_index=True) def _get_full_key_path(self, index: pd.DataFrame) -> str: id = "" for group in self.object_grouping: id = f"{id}{group}={index[group].iloc[0]}/" id = id.rstrip("/") return f"{self.prefix}{id}{self.object_file_extension}" def _gather_indexable_data(self, df: pd.DataFrame) -> pd.DataFrame: return self._json_normalize_and_filter(df) def _json_normalize_and_filter(self, docs: pd.DataFrame) -> pd.DataFrame: dfs = [] for chunk in chunker(df=docs, chunk_size=1000): dfs.append(pd.json_normalize(chunk.to_dict(orient="records"), sep="_")[self.searchable_fields]) return pd.concat(dfs, ignore_index=True) def _write_doc_and_update_index(self, items: pd.DataFrame, index: pd.DataFrame) -> None: self._write_doc_to_s3(items, index) self.index.update(index) def _write_doc_to_s3(self, doc: pd.DataFrame, index: pd.DataFrame) -> None: doc = doc.replace({pd.NaT: None}).replace({"NaT": None}).replace({np.nan: None}) string_io = StringIO() with jsonlines.Writer(string_io, dumps=json_util.dumps) as writer: for _, row in doc.iterrows(): writer.write(row.to_dict()) data = gzip.compress(string_io.getvalue().encode("utf-8")) self.s3_client.upload_fileobj( Bucket=self.bucket, Fileobj=BytesIO(data), Key=self._get_full_key_path(index), ) def _read_doc_from_s3(self, file_id: str) -> pd.DataFrame: try: response = self.s3_client.get_object(Bucket=self.bucket, Key=file_id) df = pd.read_json(response["Body"], orient="records", lines=True, compression={"method": "gzip"}) def replace_nested_date_dict(obj): if isinstance(obj, dict): if "$date" in obj: # Return the datetime string or convert it to a datetime object return datetime.fromisoformat(obj["$date"].rstrip("Z")) # Recursively process each key-value pair in the dictionary for key, value in obj.items(): obj[key] = replace_nested_date_dict(value) elif isinstance(obj, list): # Process each item in the list return [replace_nested_date_dict(item) for item in obj] return obj return df.map(replace_nested_date_dict) except ClientError 
as ex: if ex.response["Error"]["Code"] == "NoSuchKey": return pd.DataFrame() raise def _index_for_doc_from_s3(self, key: str) -> pd.DataFrame: doc = self._read_doc_from_s3(key) return self._gather_indexable_data(doc) def rebuild_index_from_s3_data(self) -> pd.DataFrame: """ Rebuilds the index Store from the data in S3 Stores only the searchable_fields in the index. Only updates the in-memory index and does not persist the index; please call `store_manifest` with the returned values to persist. Returns: pd.DataFrame: The set of docs representing the index data. """ paginator = self.s3_client.get_paginator("list_objects_v2") # Create a PageIterator from the Paginator page_iterator = paginator.paginate(Bucket=self.bucket, Prefix=self.prefix) all_index_docs = [] for page in page_iterator: for file in page["Contents"]: key = file["Key"] if key != self.index._get_manifest_full_key_path() and key.endswith(self.object_file_extension): all_index_docs.append(self._index_for_doc_from_s3(key)) ret = pd.concat(all_index_docs, ignore_index=True) self.index.set_index_data(ret) return ret def rebuild_index_from_data(self, docs: pd.DataFrame) -> pd.DataFrame: """ Rebuilds the index Store from the provided data. The provided data needs to include all of the documents in this data set. Stores only the searchable_fields in the index. Only updates the in-memory index and does not persist the index; please call `store_manifest` with the returned values to persist. Returns: pd.DataFrame: The set of docs representing the index data. """ ret = self._gather_indexable_data(docs) self.index.set_index_data(ret) return ret def __getstate__(self): # Return the object's state excluding the _s3_client attribute state = self.__dict__.copy() state["_s3_client"] = None # Exclude the client from serialization return state def __setstate__(self, state): # Restore instance attributes (excluding the client) self.__dict__.update(state) # Initialize the client as None; it will be recreated on demand self._s3_client = None def __hash__(self): return hash( ( self.bucket, self.endpoint_url, self.key, self.prefix, tuple(self.object_grouping), tuple(self.searchable_fields), ) ) def __eq__(self, other: object) -> bool: """ Check equality for OpenDataStore. other: other OpenDataStore to compare with. """ if not isinstance(other, OpenDataStore): return False fields = [ "_index", "bucket", "endpoint_url", "key", "searchable_fields", "prefix", "last_updated_field", "object_grouping", ] return all(getattr(self, f) == getattr(other, f) for f in fields) class TasksOpenDataStore(OpenDataStore): """ Task data is stored on S3 compatible storage using the format used by Materials Project on OpenData. The index is loaded from S3 compatible storage into memory. Note that updates will only affect the in-memory representation of the index - they will not be persisted. To persist index writes utilize the `store_manifest` function. This Store should not be used for applications that are distributed and rely on reading updated values from the index as data inconsistencied will arise. 
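    Example:
        A sketch of loading an existing task data set and rebuilding its index; the
        bucket, prefix, and grouping fields are assumptions.

        ```
        store = TasksOpenDataStore(
            collection_name="tasks",
            bucket="my-opendata-bucket",
            prefix="raw/tasks",
            object_grouping=["formula_pretty", "task_id"],
        )
        store.connect()
        index = store.rebuild_index_from_s3_data()  # trailing grouping field is parsed from each object key
        ```

    Note that this store is read-only with respect to task documents: calling ``update``
    raises ``NotImplementedError``.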
""" def __init__( self, **kwargs, ): """Initializes a TaskOpenDataStore.""" super().__init__(**kwargs) def _index_for_doc_from_s3(self, key: str) -> pd.DataFrame: doc = self._read_doc_from_s3(key) # create an entry for the trailing object grouping field col = self.object_grouping[-1] val = re.search(rf"{col}=(.+)\.jsonl\.gz", key).group(1) doc[col] = val return self._gather_indexable_data(doc) def update(self, docs: pd.DataFrame) -> pd.DataFrame: raise NotImplementedError("update is not supported for this store") maggma-0.70.0/src/maggma/stores/shared_stores.py000066400000000000000000000516501470132070100216070ustar00rootroot00000000000000from collections.abc import Iterator from functools import partial from multiprocessing.managers import BaseManager from threading import Lock from typing import Any, Callable, Optional, Union from monty.json import MontyDecoder from maggma.core.store import Sort, Store class StoreFacade(Store): """ This class provides a way to access a single store within a MultiStore with the same attributes as any ordinary maggma store. ``` # Create the multistore multistore = MultiStore() # Add a store to the multistore and create a facade to access it first_store = StoreFacade(MongoStore(..., collection_name="collection_one"), multistore) # Add a second store to the multistore and create a facade to access it second_store = StoreFacade(MongoStore(..., collection_name="collection_two"), multistore) # Attempt to add a duplicate store and create a facade to access an equivalent store third_store = StoreFacade(MongoStore(..., collection_name="collection_two"), multistore) multistore.count_stores() # Returns 2, since only 2 unique stores were added # We can then use the stores as we would normally do first_store.query(criteria={}, properties=[]) second_store.update() third_store.remove_docs() ``` """ def __init__(self, store, multistore): # Keep track of this store for the purposes of checking # equality, but it will never be connected to self.store = store self.multistore = multistore self.multistore.ensure_store(self.store) def __getattr__(self, name: str) -> Any: if name not in dir(self): return self.multistore._proxy_attribute(name, self.store) return None def __setattr__(self, name: str, value: Any): if name not in ["store", "multistore"]: self.multistore.set_store_attribute(self.store, name, value) else: super().__setattr__(name, value) @property def _collection(self): """ Returns a handle to the pymongo collection object. """ return self.multistore.store_collection(self.store) @property def name(self) -> str: """ Return a string representing this data source. """ return self.multistore.store_name(self.store) def connect(self, force_reset: bool = False): """ Connect to the source data. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ self.multistore.connect(self.store, force_reset=force_reset) def close(self): """ Closes any connections. """ self.multistore.close(self.store) def count(self, criteria: Optional[dict] = None) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ return self.multistore.count(self.store, criteria=criteria) def query( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, ) -> Iterator[dict]: """ Queries the Store for a set of documents. 
Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned """ return self.multistore.query( self.store, criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, ) def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None, **kwargs): """ Update documents into the Store. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ return self.multistore.update(self.store, docs=docs, key=key, **kwargs) def ensure_index(self, key: str, unique: bool = False, **kwargs) -> bool: """ Tries to create an index and return true if it succeeded. Args: key: single key to index unique: Whether or not this index contains only unique keys Returns: bool indicating if the index exists/was created """ return self.multistore.ensure_index(self.store, key=key, unique=unique, **kwargs) def groupby( self, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, **kwargs, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ return self.multistore.groupby( self.store, keys=keys, criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, **kwargs ) def remove_docs(self, criteria: dict, **kwargs): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ return self.multistore.remove_docs(self.store, criteria=criteria, **kwargs) def query_one( self, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, **kwargs, ): """ Queries the Store for a single document. Args: criteria: PyMongo filter for documents to search properties: properties to return in the document sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. """ return self.multistore.query_one(self.store, criteria=criteria, properties=properties, sort=sort, **kwargs) def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False, **kwargs) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ return self.multistore.distinct(self.store, field=field, criteria=criteria, all_exist=all_exist, **kwargs) class MultiStore: """ A container for multiple maggma stores. When adding stores to a MultiStore, a check will be performed to see if the store (or a equivalent) already exists. If it does, it will not be added again. 
This enables the caching of Stores with the intent of pooling connections to make sure that the same connection is used when accessing as store many times (from different processes). Notes: 1) While this class implements the abstract methods of a Store, it is not a store. The additional `store` argument does not conform with the abstract base class. 2) The stores should not be directly accessed via MultiStore()._stores. The MultiStore must be used with the StoreFacade class, which is consistent with other Stores. An example of usage is as follows: ``` # Create the multistore multistore = MultiStore() # Add a store to the multistore first_store = MongoStore(..., collection_name="collection_one") multistore.ensure_store(first_store) multistore.count_stores() # Returns 1, since there is one store added # Add a second store to the multistore second_store = MongoStore(..., collection_name="collection_two") multistore.ensure_store(second_store) multistore.count_stores() # Returns 2, since there are two stores added # Attempt to add a duplicate store third_store = MongoStore(..., collection_name="collection_two") multistore.ensure_store(second_store) # The store will not be added since it already exists multistore.count_stores() # Returns 2 ``` """ def __init__(self, **kwargs): """ Initializes a MultiStore. """ # Keep a list of stores, since there is no way to hash a store (to use a dict) self._stores = [] self._multistore_lock = Lock() super().__init__(**kwargs) def get_store_index(self, store: Store) -> Optional[int]: """ Gets the index of the store in the list of stores. If it doesn't exist, returns None. Note: this is not a search for an instance of a store, but rather a search for a equivalent store Args: store: The store to find Returns: The index of the store in the internal list of cached stores """ # check host, port, name, and username for i, _store in enumerate(self._stores): if store == _store: return i return None def add_store(self, store: Store): """ Adds a store to the list of cached stores. Args: store: The store to cache Returns: True if the store was added or if it already exists """ # Check that the store is actually a store if not isinstance(store, Store): raise TypeError("store must be a Store") # We are writing to the _stores list, so a lock # must be used to ensure no simultaneous writes with self._multistore_lock: # Check if the store exists, just a double check # in case another process added it before this lock was acquired maybe_store_exists = self.get_store_index(store) if maybe_store_exists is None: # Make a new instance of it, so it doesn't get # modified outside of this process unintentionally self._stores.append(MontyDecoder().process_decoded(store.as_dict())) self._stores[-1].connect() return True # Store already exists, we don't need to add it return True def ensure_store(self, store: Store) -> bool: """ Tries to add the store to the list of cached stores and return true if it succeeded. Args: store: The store to cache Returns: bool indicating if the store exists/was created """ if self.get_store_index(store) is None: # Store doesn't exist here, we should add it return self.add_store(store) return True def count_stores(self) -> int: """ Returns the number of stores in the multistore. 
Returns: int indicating the number of stores """ return len(self._stores) # These are maggma stores attributes we must provide access to def store_collection(self, store): store_id = self.get_store_index(store) return self._stores[store_id]._collection def store_name(self, store) -> str: store_id = self.get_store_index(store) return self._stores[store_id].name def connect(self, store, force_reset: bool = False): """ For a given store, connect to the source data. Args: store: the store to connect to the source data force_reset: whether to reset the connection or not when the Store is already connected. """ with self._multistore_lock: store_id = self.get_store_index(store) self._stores[store_id].connect(force_reset) def close(self, store: Store): """ For a given store, close any connections. Args: store: the store to close connections to """ with self._multistore_lock: store_id = self.get_store_index(store) self._stores[store_id].close() def connect_all(self, force_reset: bool = False): """ Connects to all stores. Args: force_reset: whether to reset the connection or not when the Store is already connected. """ with self._multistore_lock: for store in self._stores: store.connect(force_reset) def close_all(self): """ Closes all connections. """ with self._multistore_lock: for store in self._stores: store.close() def count(self, store: Store, criteria: Optional[dict] = None, **kwargs) -> int: """ Counts the number of documents matching the query criteria. Args: criteria: PyMongo filter for documents to count in """ store_id = self.get_store_index(store) return self._stores[store_id].count(criteria=criteria, **kwargs) def query( self, store: Store, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, **kwargs, ) -> list[dict]: """ Queries the Store for a set of documents. Args: criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned """ store_id = self.get_store_index(store) # We must return a list, since a generator is not serializable return list( self._stores[store_id].query( criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, **kwargs ) ) def update(self, store: Store, docs: Union[list[dict], dict], key: Union[list, str, None] = None, **kwargs): """ Update documents into the Store. Args: docs: the document or list of documents to update key: field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used """ store_id = self.get_store_index(store) return self._stores[store_id].update(docs=docs, key=key, **kwargs) def ensure_index(self, store: Store, key: str, unique: bool = False, **kwargs) -> bool: """ Tries to create an index and return true if it succeeded. 
Args: key: single key to index unique: Whether or not this index contains only unique keys Returns: bool indicating if the index exists/was created """ store_id = self.get_store_index(store) return self._stores[store_id].ensure_index(key=key, unique=unique, **kwargs) def groupby( self, store: Store, keys: Union[list[str], str], criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, skip: int = 0, limit: int = 0, **kwargs, ) -> Iterator[tuple[dict, list[dict]]]: """ Simple grouping function that will group documents by keys. Args: keys: fields to group documents criteria: PyMongo filter for documents to search in properties: properties to return in grouped documents sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. skip: number documents to skip limit: limit on total number of documents returned Returns: generator returning tuples of (dict, list of docs) """ store_id = self.get_store_index(store) return self._stores[store_id].groupby( keys=keys, criteria=criteria, properties=properties, sort=sort, skip=skip, limit=limit, **kwargs ) def remove_docs(self, store: Store, criteria: dict, **kwargs): """ Remove docs matching the query dictionary. Args: criteria: query dictionary to match """ store_id = self.get_store_index(store) return self._stores[store_id].remove_docs(criteria=criteria, **kwargs) def query_one( self, store: Store, criteria: Optional[dict] = None, properties: Union[dict, list, None] = None, sort: Optional[dict[str, Union[Sort, int]]] = None, **kwargs, ): """ Queries the Store for a single document. Args: criteria: PyMongo filter for documents to search properties: properties to return in the document sort: Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending. """ store_id = self.get_store_index(store) return next( self._stores[store_id].query(criteria=criteria, properties=properties, sort=sort, **kwargs), None, ) def distinct( self, store: Store, field: str, criteria: Optional[dict] = None, all_exist: bool = False, **kwargs ) -> list: """ Get all distinct values for a field. Args: field: the field(s) to get distinct values for criteria: PyMongo filter for documents to search in """ store_id = self.get_store_index(store) return self._stores[store_id].distinct(field=field, criteria=criteria, all_exist=all_exist, **kwargs) def set_store_attribute(self, store: Store, name: str, value: Any): """ A method to set an attribute of a store. Args: name: The name of a function or attribute to access store: The store to access the attribute of value: New value of the attribute """ store_id = self.get_store_index(store) setattr(self._stores[store_id], name, value) def call_attr(self, name: str, store: Store, **kwargs): """ This class will actually call an attribute/method on the class instance. Args: name: The name of a function or attribute to access store: The store to access the attribute of Returns: The result of the attribute or function call """ store_id = self.get_store_index(store) return getattr(self._stores[store_id], name)(**kwargs) def _proxy_attribute(self, name: str, store) -> Union[Any, Callable]: """ This function will take care of the StoreFacade accessing attributes or functions of the store that are not required by the Store abstract class. 
Args: name: The name of a function or attribute to access store: The store to access the attribute of Returns: The attribute or a partial function which gives access to the attribute """ store_id = self.get_store_index(store) maybe_fn = getattr(self._stores[store_id], name) if callable(maybe_fn): return partial(self.call_attr, name=name, store=store) return maybe_fn class MultiStoreManager(BaseManager): """ Provide a server that can host shared objects between multiprocessing Processes (that normally can't share data). For example, a common MultiStore is shared between processes and access is coordinated to limit DB hits. # Adapted from fireworks/utilities/fw_utilities.py """ @classmethod def setup(cls, multistore): """ Args: multistore: A multistore to share between processes. Returns: A manager """ MultiStoreManager.register("MultiStore", callable=lambda: multistore) m = MultiStoreManager(address=("127.0.0.1", 0), authkey=b"abcd") m.start() return m maggma-0.70.0/src/maggma/stores/ssh_tunnel.py000066400000000000000000000064141470132070100211220ustar00rootroot00000000000000from socket import socket from typing import Optional from monty.json import MSONable from sshtunnel import SSHTunnelForwarder class SSHTunnel(MSONable): """SSH tunnel to remote server.""" __TUNNELS: dict[str, SSHTunnelForwarder] = {} def __init__( self, tunnel_server_address: str, remote_server_address: str, local_port: Optional[int] = None, username: Optional[str] = None, password: Optional[str] = None, private_key: Optional[str] = None, **kwargs, ): """ Args: tunnel_server_address: string address with port for the SSH tunnel server remote_server_address: string address with port for the server to connect to local_port: optional port to use for the local address (127.0.0.1); if `None`, a random open port will be automatically selected username: optional username for the ssh tunnel server password: optional password for the ssh tunnel server; If a private_key is supplied this password is assumed to be the private key password private_key: ssh private key to authenticate to the tunnel server kwargs: any extra args passed to the SSHTunnelForwarder. 
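        Example:
            A sketch; the hosts, ports, username, and key path are placeholders.

            ```
            tunnel = SSHTunnel(
                tunnel_server_address="bastion.example.com:22",
                remote_server_address="10.0.0.5:27017",
                username="deploy",
                private_key="~/.ssh/id_rsa",
            )
            tunnel.start()
            host, port = tunnel.local_address  # point clients at 127.0.0.1:<port>
            tunnel.stop()
            ```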
""" self.tunnel_server_address = tunnel_server_address self.remote_server_address = remote_server_address self.local_port = local_port self.username = username self.password = password self.private_key = private_key self.kwargs = kwargs if remote_server_address in SSHTunnel.__TUNNELS: self.tunnel = SSHTunnel.__TUNNELS[remote_server_address] else: if local_port is None: local_port = _find_free_port("127.0.0.1") local_bind_address = ("127.0.0.1", local_port) ssh_address, ssh_port = tunnel_server_address.split(":") ssh_port = int(ssh_port) # type: ignore remote_bind_address, remote_bind_port = remote_server_address.split(":") remote_bind_port = int(remote_bind_port) # type: ignore if private_key is not None: ssh_password = None ssh_private_key_password = password else: ssh_password = password ssh_private_key_password = None self.tunnel = SSHTunnelForwarder( ssh_address_or_host=(ssh_address, ssh_port), local_bind_address=local_bind_address, remote_bind_address=(remote_bind_address, remote_bind_port), ssh_username=username, ssh_password=ssh_password, ssh_private_key_password=ssh_private_key_password, ssh_pkey=private_key, **kwargs, ) def start(self): if not self.tunnel.is_active: self.tunnel.start() def stop(self): if self.tunnel.tunnel_is_up: self.tunnel.stop() @property def local_address(self) -> tuple[str, int]: return self.tunnel.local_bind_address def _find_free_port(address="0.0.0.0"): s = socket() s.bind((address, 0)) # Bind to a free port provided by the host. return s.getsockname()[1] # Return the port number assigned. maggma-0.70.0/src/maggma/utils.py000066400000000000000000000201431470132070100165540ustar00rootroot00000000000000""" Utilities to help with maggma functions. """ import itertools import logging import signal import uuid from collections.abc import Iterable from datetime import datetime, timedelta from importlib import import_module from typing import Optional, Union from bson.json_util import ObjectId from dateutil import parser from pydantic import BaseModel from pydantic._internal._utils import lenient_issubclass from pydash.objects import get, has, set_ from pydash.objects import unset as _unset from pydash.utilities import to_path from pymongo.collection import Collection # import tqdm Jupyter widget if running inside Jupyter from tqdm.auto import tqdm def primed(iterable: Iterable) -> Iterable: """Preprimes an iterator so the first value is calculated immediately but not returned until the first iteration. """ itr = iter(iterable) try: first = next(itr) # itr.next() in Python 2 except StopIteration: return itr return itertools.chain([first], itr) class TqdmLoggingHandler(logging.Handler): """ Helper to enable routing tqdm progress around logging. """ def __init__(self, level=logging.NOTSET): """ Initialize the Tqdm handler. """ super().__init__(level) def emit(self, record): """ Emit a record via Tqdm screen. """ try: msg = self.format(record) tqdm.write(msg) self.flush() except (KeyboardInterrupt, SystemExit): raise except Exception: self.handleError(record) def confirm_field_index(collection: Collection, field: str) -> bool: """Confirm index on store for at least one of fields. One can't simply ensure an index exists via `store.collection.create_index` because a Builder must assume read-only access to source Stores. The MongoDB `read` built-in role does not include the `createIndex` action. 
Returns: True if an index exists for a given field False if not """ info = list(collection.index_information().values()) keys = {spec[0] for index in info for spec in index["key"]} return field in keys def to_isoformat_ceil_ms(dt: Union[datetime, str]) -> str: """Helper to account for Mongo storing datetimes with only ms precision.""" if isinstance(dt, datetime): return (dt + timedelta(milliseconds=1)).isoformat(timespec="milliseconds") if isinstance(dt, str): return dt return None def to_dt(s: Union[datetime, str]) -> datetime: """Convert an ISO 8601 string to a datetime.""" if isinstance(s, str): return parser.parse(s) if isinstance(s, datetime): return s return None # This lu_key prioritizes not duplicating potentially expensive item # processing on incremental rebuilds at the expense of potentially missing a # source document updated within 1 ms of a builder get_items call. Ensure # appropriate builder validation. LU_KEY_ISOFORMAT = (to_dt, to_isoformat_ceil_ms) def recursive_update(d: dict, u: dict): """ Recursive updates d with values from u. Args: d (dict): dict to update u (dict): updates to propagate """ for k, v in u.items(): if k in d: if isinstance(v, dict) and isinstance(d[k], dict): recursive_update(d[k], v) else: d[k] = v else: d[k] = v def grouper(iterable: Iterable, n: int) -> Iterable: """ Collect data into fixed-length chunks or blocks. >>> list(grouper(3, 'ABCDEFG')) [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]. Updated from: https://stackoverflow.com/questions/31164731/python-chunking-csv-file-multiproccessing/31170795#31170795 """ iterable = iter(iterable) return iter(lambda: list(itertools.islice(iterable, n)), []) def lazy_substitute(d: dict, aliases: dict): """ Simple top level substitute that doesn't dive into mongo like strings. """ for alias, key in aliases.items(): if key in d: d[alias] = d[key] del d[key] def substitute(d: dict, aliases: dict): """ Substitutes keys in dictionary Accepts multilevel mongo like keys. """ for alias, key in aliases.items(): if has(d, key): set_(d, alias, get(d, key)) unset(d, key) def unset(d: dict, key: str): """ Unsets a key. """ _unset(d, key) path = to_path(key) for i in reversed(range(1, len(path))): if len(get(d, path[:i])) == 0: unset(d, path[:i]) class Timeout: """ Context manager that provides context. implementation courtesy of https://stackoverflow.com/a/22348885/637562. """ def __init__(self, seconds=14, error_message=""): """ Set a maximum running time for functions. :param seconds (int): Seconds before TimeoutError raised, set to None to disable, default is set assuming a maximum running time of 1 day for 100,000 items parallelized across 16 cores, i.e. int(16 * 24 * 60 * 60 / 1e5) :param error_message (str): Error message to display with TimeoutError """ self.seconds = int(seconds) if seconds else None self.error_message = error_message def handle_timeout(self, signum, frame): """ Raises an error on timeout. """ raise TimeoutError(self.error_message) def __enter__(self): """ Enter context with timeout. """ if self.seconds: signal.signal(signal.SIGALRM, self.handle_timeout) signal.alarm(self.seconds) def __exit__(self, type, value, traceback): """ Exit context with timeout. """ if self.seconds: signal.alarm(0) def dynamic_import(abs_module_path: str, class_name: Optional[str] = None): """ Dynamic class importer from: https://www.bnmetrics.com/blog/dynamic-import-in-python3. 
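Example (illustrative):

        store_cls = dynamic_import("maggma.stores.MongoStore")
        # equivalent to
        store_cls = dynamic_import("maggma.stores", class_name="MongoStore")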
""" if class_name is None: class_name = abs_module_path.split(".")[-1] abs_module_path = ".".join(abs_module_path.split(".")[:-1]) module_object = import_module(abs_module_path) return getattr(module_object, class_name) class ReportingHandler(logging.Handler): """ Helper to route reporting messages This uses the NOTSET level to send reporting messages. """ def __init__(self, reporting_store): """ Initialize the Reporting Logger. """ super().__init__(logging.NOTSET) self.reporting_store = reporting_store self.reporting_store.connect() self.errors = 0 self.warnings = 0 self.build_id = uuid.uuid4() def emit(self, record): """ Emit a record via Tqdm screen. """ if "maggma" in record.__dict__: maggma_record = record.maggma event = maggma_record["event"] maggma_record.update( { "last_updated": datetime.utcnow(), "machine": uuid.UUID(int=uuid.getnode()), } ) if event == "BUILD_STARTED": self.errors = 0 self.warnings = 0 self.build_id = uuid.uuid4() elif event == "BUILD_ENDED": maggma_record.update({"errors": self.errors, "warnings": self.warnings}) maggma_record["_id"] = ObjectId() maggma_record["build_id"] = self.build_id self.reporting_store.update(maggma_record, key="_id") def get_flat_models_from_model(model: BaseModel, known_models: Optional[set[BaseModel]] = None): """Get all sub-models from a pydantic model. Args: model (BaseModel): Pydantic model known_models (set, optional): Set with known models. Defaults to set(). Returns: (set[BaseModel]): Set of pydantic models """ known_models = known_models or set() known_models.add(model) for field_info in model.model_fields.values(): field_type = field_info.annotation if lenient_issubclass(field_type, BaseModel): get_flat_models_from_model(field_type, known_models) return known_models maggma-0.70.0/src/maggma/validators.py000066400000000000000000000063211470132070100175660ustar00rootroot00000000000000""" Validator class for document-level validation on Stores. Attach an instance of a Validator subclass to a Store .schema variable to enable validation on that Store. """ from jsonschema import ValidationError, validate from jsonschema.validators import validator_for from maggma.core import Validator class JSONSchemaValidator(Validator): """ A validator that allows document validation against a provided JSON schema. For convenience, the helper method in this module `msonable_schema` can be used to create a schema for a specific MSONable object, which can be embedded in your JSON schema. See the tests for an example of this. """ def __init__(self, schema: dict, strict: bool = False): """ Args: strict: Informs Store how to treat Validator: if True, will cause build to fail if invalid document is found and raise a ValueError, if False will continue build but log an error message. In both cases, invalid documents will not be stored. schema: A Python dict representation of a JSON. """ self._schema = schema self._strict = strict @property def strict(self) -> bool: """ Whether is_valid() should raise a ValidationError or simply return False if a document fails validation. """ return self._strict @property def schema(self) -> dict: """ Defines a JSON schema for your document, which is used by the default `validate_doc()` method. This is a standard, with many validators existing, including MongoDB's own JSON schema validator (3.6+). Implementing this property allows both the document to be validated by a Builder, and also makes it possible to enable document-level validation inside MongoDB for Mongo-backed Stores. 
""" return self._schema def is_valid(self, doc: dict) -> bool: """ Returns True or False if validator initialized with strict=False, or returns True or raises ValidationError if strict=True. Args: doc (dict): a single document """ try: validate(doc, schema=self.schema) return True except ValidationError: if self.strict: raise return False def validation_errors(self, doc: dict) -> list[str]: """ If document is not valid, provides a list of strings to display for why validation has failed. Returns empty list if the document is valid Args: doc - document to check """ if self.is_valid(doc): return [] validator = validator_for(self.schema)(self.schema) return ["{}: {}".format(".".join(error.absolute_path), error.message) for error in validator.iter_errors(doc)] def msonable_schema(cls): """ Convenience function to return a JSON Schema for any MSONable class. """ return { "type": "object", "required": ["@class", "@module"], "properties": { "@class": {"const": cls.__name__}, "@module": {"const": cls.__module__}, }, } maggma-0.70.0/tests/000077500000000000000000000000001470132070100141645ustar00rootroot00000000000000maggma-0.70.0/tests/__init__.py000066400000000000000000000000001470132070100162630ustar00rootroot00000000000000maggma-0.70.0/tests/api/000077500000000000000000000000001470132070100147355ustar00rootroot00000000000000maggma-0.70.0/tests/api/__init__.py000066400000000000000000000000001470132070100170340ustar00rootroot00000000000000maggma-0.70.0/tests/api/test_aggregation_resource.py000066400000000000000000000044721470132070100225530ustar00rootroot00000000000000from datetime import datetime from random import randint import pytest from fastapi import FastAPI from pydantic import BaseModel, Field from starlette.testclient import TestClient from maggma.api.query_operator.core import QueryOperator from maggma.api.resource import AggregationResource from maggma.stores import MemoryStore class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(None, title="Owne'r Age") weight: float = Field(None, title="Owner's weight") last_updated: datetime = Field(None, title="Last updated date for this record") owners = ( [Owner(name=f"Person{i}", age=i + 3, weight=100 + i) for i in list(range(10))] + [Owner(name="PersonAge9", age=9, weight=float(randint(155, 195)))] + [Owner(name="PersonWeight150", age=randint(10, 15), weight=float(150))] + [Owner(name="PersonAge20Weight200", age=20, weight=float(200))] ) total_owners = len(owners) @pytest.fixture() def owner_store(): store = MemoryStore("owners", key="name") store.connect() store.update([d.dict() for d in owners]) return store @pytest.fixture() def pipeline_query_op(): class PipelineQuery(QueryOperator): def query(self): pipeline = [ {"$match": {"name": "PersonAge9"}}, {"$project": {"age": 1}}, ] return {"pipeline": pipeline} return PipelineQuery() def test_init(owner_store, pipeline_query_op): resource = AggregationResource(store=owner_store, pipeline_query_operator=pipeline_query_op, model=Owner) assert len(resource.router.routes) == 2 def test_msonable(owner_store, pipeline_query_op): owner_resource = AggregationResource(store=owner_store, pipeline_query_operator=pipeline_query_op, model=Owner) endpoint_dict = owner_resource.as_dict() for k in ["@class", "@module", "store", "model"]: assert k in endpoint_dict assert isinstance(endpoint_dict["model"], str) assert endpoint_dict["model"] == "tests.api.test_aggregation_resource.Owner" def test_aggregation_search(owner_store, pipeline_query_op): endpoint = 
AggregationResource(owner_store, pipeline_query_operator=pipeline_query_op, model=Owner) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.get("/").status_code == 200 maggma-0.70.0/tests/api/test_api.py000066400000000000000000000104021470132070100171140ustar00rootroot00000000000000import json from enum import Enum from random import choice, randint from typing import Any from urllib.parse import urlencode import pytest from fastapi.encoders import jsonable_encoder from pydantic import BaseModel, Field from requests import Response from starlette.testclient import TestClient from maggma.api.API import API from maggma.api.query_operator import NumericQuery, PaginationQuery, SparseFieldsQuery, StringQueryOperator from maggma.api.resource import ReadOnlyResource from maggma.stores import MemoryStore class PetType(str, Enum): cat = "cat" dog = "dog" class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(..., title="Owne'r Age") weight: int = Field(..., title="Owner's weight") class Pet(BaseModel): name: str = Field(..., title="Pet's Name") pet_type: PetType = Field(..., title="Pet Type") owner_name: str = Field(..., title="Owner's name") owners = [Owner(name=f"Person{i}", age=randint(10, 100), weight=randint(100, 200)) for i in list(range(10))] pets = [ Pet( name=f"Pet{i}", pet_type=choice(list(PetType)), owner_name=choice(owners).name, ) for i in list(range(40)) ] @pytest.fixture() def owner_store(): store = MemoryStore("owners", key="name") store.connect() store.update([jsonable_encoder(d) for d in owners]) return store @pytest.fixture() def pet_store(): store = MemoryStore("pets", key="name") store.connect() store.update([jsonable_encoder(d) for d in pets]) return store def test_msonable(owner_store, pet_store): owner_endpoint = ReadOnlyResource(owner_store, Owner) pet_endpoint = ReadOnlyResource(pet_store, Pet) manager = API({"owners": owner_endpoint, "pets": pet_endpoint}) api_dict = manager.as_dict() for k in ["@class", "@module", "resources"]: assert k in api_dict def search_helper(payload, base: str = "/?", debug=True) -> tuple[Response, Any]: """ Helper function to directly query search endpoints Args: store: store f base: base of the query, default to /query? 
client: TestClient generated from FastAPI payload: query in dictionary format debug: True = print out the url, false don't print anything Returns: request.Response object that contains the response of the corresponding payload """ owner_store = MemoryStore("owners", key="name") owner_store.connect() owner_store.update([d.model_dump() for d in owners]) pets_store = MemoryStore("pets", key="name") pets_store.connect() pets_store.update([jsonable_encoder(d) for d in pets]) resources = { "owners": [ ReadOnlyResource( owner_store, Owner, query_operators=[ StringQueryOperator(model=Owner), # type: ignore NumericQuery(model=Owner), # type: ignore SparseFieldsQuery(model=Owner), PaginationQuery(), ], ) ], "pets": [ ReadOnlyResource( pets_store, Owner, query_operators=[ StringQueryOperator(model=Pet), NumericQuery(model=Pet), SparseFieldsQuery(model=Pet), PaginationQuery(), ], ) ], } api = API(resources=resources) client = TestClient(api.app) url = base + urlencode(payload) if debug: print(url) res = client.get(url) try: data = res.json().get("data", []) except json.decoder.JSONDecodeError: data = res.text return res, data def test_cluster_run(owner_store, pet_store): res, data = search_helper(payload="") assert res.status_code == 200 payload = {"name": "Person1", "_limit": 10, "_all_fields": True} res, data = search_helper(payload=payload, base="/owners/?") assert res.status_code == 200 assert len(data) == 1 assert data[0]["name"] == "Person1" payload = {"name": "Pet1", "_limit": 10, "_all_fields": True} res, data = search_helper(payload=payload, base="/pets/?") assert res.status_code == 200 assert len(data) == 1 assert data[0]["name"] == "Pet1" maggma-0.70.0/tests/api/test_post_resource.py000066400000000000000000000037571470132070100212560ustar00rootroot00000000000000from datetime import datetime from random import randint import pytest from fastapi import FastAPI from pydantic import BaseModel, Field from starlette.testclient import TestClient from maggma.api.resource import PostOnlyResource from maggma.stores import MemoryStore class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(None, title="Owne'r Age") weight: float = Field(None, title="Owner's weight") last_updated: datetime = Field(None, title="Last updated date for this record") owners = ( [Owner(name=f"Person{i}", age=i + 3, weight=100 + i) for i in list(range(10))] + [Owner(name="PersonAge9", age=9, weight=float(randint(155, 195)))] + [Owner(name="PersonWeight150", age=randint(10, 15), weight=float(150))] + [Owner(name="PersonAge20Weight200", age=20, weight=float(200))] ) total_owners = len(owners) @pytest.fixture() def owner_store(): store = MemoryStore("owners", key="name") store.connect() store.update([d.dict() for d in owners]) return store def test_init(owner_store): resource = PostOnlyResource(store=owner_store, model=Owner) assert len(resource.router.routes) == 2 def test_msonable(owner_store): owner_resource = PostOnlyResource(store=owner_store, model=Owner) endpoint_dict = owner_resource.as_dict() for k in ["@class", "@module", "store", "model"]: assert k in endpoint_dict assert isinstance(endpoint_dict["model"], str) assert endpoint_dict["model"] == "tests.api.test_post_resource.Owner" def test_post_to_search(owner_store): endpoint = PostOnlyResource(owner_store, Owner) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.post("/").status_code == 200 @pytest.mark.xfail() def test_problem_query_params(owner_store): endpoint = PostOnlyResource(owner_store, 
Owner) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) client.get("/?param=test").status_code maggma-0.70.0/tests/api/test_query_operators.py000066400000000000000000000074641470132070100216240ustar00rootroot00000000000000from datetime import datetime from enum import Enum import pytest from fastapi import HTTPException from monty.serialization import dumpfn, loadfn from monty.tempfile import ScratchDir from pydantic import BaseModel, Field from maggma.api.query_operator import NumericQuery, PaginationQuery, SortQuery, SparseFieldsQuery from maggma.api.query_operator.submission import SubmissionQuery class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(None, title="Owne'r Age") weight: float = Field(None, title="Owner's weight") last_updated: datetime = Field(None, title="Last updated date for this record") def test_pagination_functionality(): op = PaginationQuery() assert op.query(_skip=10, _limit=20, _page=None, _per_page=None) == { "limit": 20, "skip": 10, } assert op.query(_skip=None, _limit=None, _page=3, _per_page=23) == { "limit": 23, "skip": 46, } with pytest.raises(HTTPException): op.query(_limit=10000, _skip=100, _page=None, _per_page=None) with pytest.raises(HTTPException): op.query(_limit=None, _skip=None, _page=5, _per_page=10000) with pytest.raises(HTTPException): op.query(_limit=-1, _skip=100, _page=None, _per_page=None) with pytest.raises(HTTPException): op.query(_page=-1, _per_page=100, _skip=None, _limit=None) def test_pagination_serialization(): op = PaginationQuery() with ScratchDir("."): dumpfn(op, "temp.json") new_op = loadfn("temp.json") assert new_op.query(_skip=10, _limit=20, _page=None, _per_page=None) == { "limit": 20, "skip": 10, } def test_sparse_query_functionality(): op = SparseFieldsQuery(model=Owner) assert op.meta()["default_fields"] == ["name", "age", "weight", "last_updated"] assert op.query() == {"properties": ["name", "age", "weight", "last_updated"]} def test_sparse_query_serialization(): op = SparseFieldsQuery(model=Owner) with ScratchDir("."): dumpfn(op, "temp.json") new_op = loadfn("temp.json") assert new_op.query() == {"properties": ["name", "age", "weight", "last_updated"]} def test_numeric_query_functionality(): op = NumericQuery(model=Owner) assert op.meta() == {} assert op.query(age_max=10, age_min=1, age_not_eq=[2, 3], weight_min=120) == { "criteria": { "age": {"$lte": 10, "$gte": 1, "$ne": [2, 3]}, "weight": {"$gte": 120}, } } def test_numeric_query_serialization(): op = NumericQuery(model=Owner) with ScratchDir("."): dumpfn(op, "temp.json") new_op = loadfn("temp.json") assert new_op.query(age_max=10) == {"criteria": {"age": {"$lte": 10}}} def test_sort_query_functionality(): op = SortQuery() assert op.query(_sort_fields="volume,-density") == {"sort": {"volume": 1, "density": -1}} def test_sort_query_fail(): op = SortQuery(max_num=1) with pytest.raises(HTTPException): op.query(_sort_fields="volume,-density") def test_sort_serialization(): op = SortQuery() with ScratchDir("."): dumpfn(op, "temp.json") new_op = loadfn("temp.json") assert new_op.query(_sort_fields="volume,-density") == {"sort": {"volume": 1, "density": -1}} @pytest.fixture() def status_enum(): class StatusEnum(Enum): state_A = "A" state_B = "B" return StatusEnum def test_submission_functionality(status_enum): op = SubmissionQuery(status_enum) dt = datetime.utcnow() assert op.query(state=status_enum.state_A, last_updated=dt) == { "criteria": { "$and": [ {"$expr": {"$eq": [{"$arrayElemAt": ["$state", -1]}, "A"]}}, 
{"$expr": {"$gt": [{"$arrayElemAt": ["$last_updated", -1]}, dt]}}, ] } } maggma-0.70.0/tests/api/test_read_resource.py000066400000000000000000000166171470132070100212030ustar00rootroot00000000000000import inspect from datetime import datetime from random import randint from urllib.parse import urlencode import pytest from fastapi import FastAPI from pydantic import BaseModel, Field from requests import Response from starlette.testclient import TestClient from maggma.api.query_operator import NumericQuery, SparseFieldsQuery, StringQueryOperator from maggma.api.resource import ReadOnlyResource from maggma.api.resource.core import HeaderProcessor, HintScheme from maggma.stores import AliasingStore, MemoryStore class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(None, title="Owne'r Age") weight: float = Field(None, title="Owner's weight") last_updated: datetime = Field(None, title="Last updated date for this record") owners = ( [Owner(name=f"Person{i}", age=i + 3, weight=100 + i) for i in list(range(10))] + [Owner(name="PersonAge9", age=9, weight=float(randint(155, 195)))] + [Owner(name="PersonWeight150", age=randint(10, 15), weight=float(150))] + [Owner(name="PersonAge20Weight200", age=20, weight=float(200))] ) total_owners = len(owners) # Create a subclass of the header processor to prevent TypeErrors: # Can't instantiate abstract class HeaderProcessor with abstract methods class TestHeaderProcessor(HeaderProcessor): def configure_query_on_request(self, request, query_operator): # Implement the method return {"name": "PersonAge9"} def process_header(self, response, request): # Implement the method pass @pytest.fixture() def owner_store(): store = MemoryStore("owners", key="name") store.connect() store.update([d.dict() for d in owners]) return store def test_init(owner_store): resource = ReadOnlyResource(store=owner_store, model=Owner, enable_get_by_key=True) assert len(resource.router.routes) == 3 resource = ReadOnlyResource(store=owner_store, model=Owner, enable_get_by_key=False) assert len(resource.router.routes) == 2 resource = ReadOnlyResource(store=owner_store, model=Owner, enable_default_search=False, enable_get_by_key=True) assert len(resource.router.routes) == 2 def test_msonable(owner_store): owner_resource = ReadOnlyResource(store=owner_store, model=Owner) endpoint_dict = owner_resource.as_dict() for k in ["@class", "@module", "store", "model"]: assert k in endpoint_dict assert isinstance(endpoint_dict["model"], str) assert endpoint_dict["model"] == "tests.api.test_read_resource.Owner" def test_get_by_key(owner_store): endpoint = ReadOnlyResource(owner_store, Owner, disable_validation=True, enable_get_by_key=True) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.get("/").status_code == 200 assert client.get("/Person1/").status_code == 200 assert client.get("/Person1/").json()["data"][0]["name"] == "Person1" def test_key_fields(owner_store): endpoint = ReadOnlyResource(owner_store, Owner, key_fields=["name"], enable_get_by_key=True) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.get("/Person1/").status_code == 200 assert client.get("/Person1/").json()["data"][0]["name"] == "Person1" @pytest.mark.xfail() def test_problem_query_params(owner_store): endpoint = ReadOnlyResource(owner_store, Owner) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) client.get("/?param=test").status_code @pytest.mark.xfail() def 
test_problem_hint_scheme(owner_store): class TestHintScheme(HintScheme): def generate_hints(query): return {"hint": "test"} test_store = AliasingStore(owner_store, {"owners": "test"}, key="name") ReadOnlyResource(test_store, Owner, hint_scheme=TestHintScheme()) def search_helper(payload, base: str = "/?", debug=True) -> Response: """ Helper function to directly query search endpoints. Args: store: store f base: base of the query, default to /query? client: TestClient generated from FastAPI payload: query in dictionary format debug: True = print out the url, false don't print anything Returns: request.Response object that contains the response of the corresponding payload """ store = MemoryStore("owners", key="name") store.connect() store.update([d.dict() for d in owners]) endpoint = ReadOnlyResource( store, Owner, query_operators=[ StringQueryOperator(model=Owner), NumericQuery(model=Owner), SparseFieldsQuery(model=Owner), ], header_processor=TestHeaderProcessor(), query_to_configure_on_request=StringQueryOperator(model=Owner), disable_validation=True, ) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) print(inspect.signature(NumericQuery(model=Owner).query)) url = base + urlencode(payload) if debug: print(url) res = client.get(url) json = res.json() return res, json.get("data", []) # type: ignore def test_numeric_query_operator(): # Checking int payload = {"age": 20, "_all_fields": True} res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 1 assert data[0]["age"] == 20 payload = {"age_not_eq": 9, "_all_fields": True} res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 11 payload = {"age_max": 9} res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 8 payload = {"age_min": 0} res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 13 def test_string_query_operator(): payload = {"name": "PersonAge9", "_all_fields": True} res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 1 assert data[0]["name"] == "PersonAge9" payload = {"name_not_eq": "PersonAge9", "_all_fields": True} res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 12 def test_resource_compound(): payload = { "name": "PersonAge20Weight200", "_all_fields": True, "weight_min": 199.1, "weight_max": 201.4, "age": 20, } res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 1 assert data[0]["name"] == "PersonAge20Weight200" payload = { "name": "PersonAge20Weight200", "_all_fields": False, "_fields": "name,age", "weight_min": 199.3, "weight_max": 201.9, "age": 20, } res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 assert len(data) == 1 assert data[0]["name"] == "PersonAge20Weight200" assert "weight" not in data[0] def test_configure_query_on_request(): payload = { "name": "PersonAge20Weight200", "_all_fields": False, "_fields": "name,age", "weight_min": 199.3, "weight_max": 201.9, "age": 20, } res, data = search_helper(payload=payload, base="/?", debug=True) assert res.status_code == 200 maggma-0.70.0/tests/api/test_s3_url_resource.py000066400000000000000000000013151470132070100214640ustar00rootroot00000000000000import pytest from 
maggma.api.resource import S3URLResource from maggma.stores import MemoryStore @pytest.fixture() def entries_store(): store = MemoryStore("entries", key="url") store.connect() return store def test_init(entries_store): resource = S3URLResource(store=entries_store, url_lifetime=500) assert len(resource.router.routes) == 2 def test_msonable(entries_store): resource = S3URLResource(store=entries_store, url_lifetime=500) endpoint_dict = resource.as_dict() for k in ["@class", "@module", "store", "model"]: assert k in endpoint_dict assert isinstance(endpoint_dict["model"], str) assert endpoint_dict["model"] == "maggma.api.models.S3URLDoc" maggma-0.70.0/tests/api/test_submission_resource.py000066400000000000000000000106321470132070100224520ustar00rootroot00000000000000import json from datetime import datetime from random import randint import pytest from fastapi import FastAPI from pydantic import BaseModel, Field from starlette.testclient import TestClient from maggma.api.query_operator import PaginationQuery from maggma.api.query_operator.core import QueryOperator from maggma.api.resource import SubmissionResource from maggma.stores import MemoryStore class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(None, title="Owne'r Age") weight: float = Field(None, title="Owner's weight") last_updated: datetime = Field(None, title="Last updated date for this record") owners = ( [Owner(name=f"Person{i}", age=i + 3, weight=100 + i) for i in list(range(10))] + [Owner(name="PersonAge9", age=9, weight=float(randint(155, 195)))] + [Owner(name="PersonWeight150", age=randint(10, 15), weight=float(150))] + [Owner(name="PersonAge20Weight200", age=20, weight=float(200))] ) total_owners = len(owners) @pytest.fixture() def owner_store(): store = MemoryStore("owners", key="name") store.connect() store.update([d.dict() for d in owners]) return store @pytest.fixture() def post_query_op(): class PostQuery(QueryOperator): def query(self, name): return {"criteria": {"name": name}} return PostQuery() @pytest.fixture() def patch_query_op(): class PatchQuery(QueryOperator): def query(self, name, update): return {"criteria": {"name": name}, "update": update} return PatchQuery() def test_init(owner_store, post_query_op, patch_query_op): resource = SubmissionResource( store=owner_store, get_query_operators=[PaginationQuery()], post_query_operators=[post_query_op], patch_query_operators=[patch_query_op], model=Owner, ) assert len(resource.router.routes) == 5 def test_msonable(owner_store, post_query_op): owner_resource = SubmissionResource( store=owner_store, get_query_operators=[PaginationQuery()], post_query_operators=[post_query_op], model=Owner, ) endpoint_dict = owner_resource.as_dict() for k in ["@class", "@module", "store", "model"]: assert k in endpoint_dict assert isinstance(endpoint_dict["model"], str) assert endpoint_dict["model"] == "tests.api.test_submission_resource.Owner" def test_submission_search(owner_store, post_query_op): endpoint = SubmissionResource( store=owner_store, get_query_operators=[PaginationQuery()], post_query_operators=[post_query_op], calculate_submission_id=True, model=Owner, ) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.get("/").status_code == 200 assert client.post("/?name=test_name").status_code == 200 def test_submission_patch(owner_store, post_query_op, patch_query_op): endpoint = SubmissionResource( store=owner_store, get_query_operators=[PaginationQuery()], post_query_operators=[post_query_op], 
patch_query_operators=[patch_query_op], calculate_submission_id=True, model=Owner, ) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) update = json.dumps({"last_updated": "2023-06-22T17:32:11.645713"}) assert client.get("/").status_code == 200 assert client.patch(f"/?name=PersonAge9&update={update}").status_code == 200 def test_key_fields(owner_store, post_query_op): endpoint = SubmissionResource( store=owner_store, get_query_operators=[PaginationQuery()], post_query_operators=[post_query_op], calculate_submission_id=False, model=Owner, ) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.get("/Person1/").status_code == 200 assert client.get("/Person1/").json()["data"][0]["name"] == "Person1" def test_patch_submission(owner_store, post_query_op): endpoint = SubmissionResource( store=owner_store, get_query_operators=[PaginationQuery()], post_query_operators=[post_query_op], calculate_submission_id=False, model=Owner, ) app = FastAPI() app.include_router(endpoint.router) client = TestClient(app) assert client.get("/Person1/").status_code == 200 assert client.get("/Person1/").json()["data"][0]["name"] == "Person1" maggma-0.70.0/tests/api/test_utils.py000066400000000000000000000053121470132070100175070ustar00rootroot00000000000000from datetime import datetime from enum import Enum from typing import Union import pytest from bson import ObjectId from monty.json import MSONable from pydantic import BaseModel, Field from maggma.api.utils import api_sanitize, serialization_helper class SomeEnum(Enum): A = 1 B = 2 C = 3 class Pet(MSONable): def __init__(self, name, age): self.name = name self.age = age class AnotherPet(MSONable): def __init__(self, name, age): self.name = name self.age = age class AnotherOwner(BaseModel): name: str = Field(..., description="Owner name") weight_or_pet: Union[float, AnotherPet] = Field(..., title="Owners weight or Pet") class Owner(BaseModel): name: str = Field(..., title="Owner's name") age: int = Field(..., title="Owne'r Age") weight: float = Field(..., title="Owner's weight") last_updated: datetime = Field(..., title="Last updated date for this record") pet: Pet = Field(..., title="Owner's Pet") other: SomeEnum = Field(..., title="A enum?") def test_api_sanitize(): # Ensure model validation fails with pytest.raises(ValueError): Owner() # This should still fail validation new_owner = api_sanitize(Owner, fields_to_leave=["Owner.name"]) with pytest.raises(ValueError): new_owner() new_owner(name="owner") # These will fail if non-optional fields are not turned off new_owner2 = api_sanitize(Owner) new_owner() # api_sanitize is in-place new_owner2() Owner() # This should fail type validation for pet with pytest.raises(Exception): Owner(pet="fido") temp_pet_dict = Pet(name="fido", age=3).as_dict() bad_pet_dict = dict(temp_pet_dict) del bad_pet_dict["@module"] del bad_pet_dict["@class"] # This should fail because of bad data type with pytest.raises(Exception): Owner(pet=bad_pet_dict) assert isinstance(Owner(pet=temp_pet_dict).pet, Pet) api_sanitize(Owner, allow_dict_msonable=True) # This should still fail because of bad data type with pytest.raises(Exception): Owner(pet=bad_pet_dict) # This should work assert isinstance(Owner(pet=temp_pet_dict).pet, dict) # This should work evne though AnotherPet is inside the Union type api_sanitize(AnotherOwner, allow_dict_msonable=True) temp_pet_dict = AnotherPet(name="fido", age=3).as_dict() assert isinstance(AnotherPet.validate_monty_v2(temp_pet_dict, None), dict) def 
test_serialization_helper(): oid = ObjectId("60b7d47bb671aa7b01a2adf6") assert serialization_helper(oid) == "60b7d47bb671aa7b01a2adf6" @pytest.mark.xfail() def test_serialization_helper_xfail(): oid = "test" serialization_helper(oid) maggma-0.70.0/tests/builders/000077500000000000000000000000001470132070100157755ustar00rootroot00000000000000maggma-0.70.0/tests/builders/__init__.py000066400000000000000000000000001470132070100200740ustar00rootroot00000000000000maggma-0.70.0/tests/builders/test_copy_builder.py000066400000000000000000000102771470132070100220750ustar00rootroot00000000000000""" Tests for MapBuilder """ from datetime import datetime, timedelta import pytest from maggma.builders import CopyBuilder from maggma.stores import MemoryStore @pytest.fixture() def source(): store = MemoryStore("source", key="k", last_updated_field="lu") store.connect() store.ensure_index("k") store.ensure_index("lu") return store @pytest.fixture() def target(): store = MemoryStore("target", key="k", last_updated_field="lu") store.connect() store.ensure_index("k") store.ensure_index("lu") return store @pytest.fixture(scope="module") def now(): return datetime.utcnow() @pytest.fixture() def old_docs(now): return [{"lu": now, "k": k, "v": "old"} for k in range(20)] @pytest.fixture() def new_docs(now): toc = now + timedelta(seconds=1) return [{"lu": toc, "k": k, "v": "new"} for k in range(10)] @pytest.fixture() def some_failed_old_docs(now): docs = [{"lu": now, "k": k, "v": "old", "state": "failed"} for k in range(3)] docs.extend([{"lu": now, "k": k, "v": "old", "state": "failed"} for k in range(18, 20)]) return docs def test_get_items(source, target, old_docs, some_failed_old_docs): builder = CopyBuilder(source, target) source.update(old_docs) assert len(list(builder.get_items())) == len(old_docs) target.update(old_docs) assert len(list(builder.get_items())) == 0 builder = CopyBuilder(source, target, projection=["k"]) target.remove_docs({}) assert len(list(builder.get_items())) == len(old_docs) assert all("v" not in d for d in builder.get_items()) source.update(some_failed_old_docs) target.update(old_docs) target.update(some_failed_old_docs) builder = CopyBuilder(source, target) assert len(list(builder.get_items())) == 0 builder = CopyBuilder(source, target, retry_failed=True) assert len(list(builder.get_items())) == len(some_failed_old_docs) builder = CopyBuilder(source, target, query={"k": {"$lt": 11}}) assert len(list(builder.get_items())) == 0 builder = CopyBuilder(source, target, retry_failed=True, query={"k": {"$lt": 11}}) assert len(list(builder.get_items())) == 3 def test_process_item(source, target, old_docs): builder = CopyBuilder(source, target) source.update(old_docs) items = list(builder.get_items()) assert len(items) == len(list(map(builder.process_item, items))) def test_update_targets(source, target, old_docs, new_docs): builder = CopyBuilder(source, target) builder.update_targets(old_docs) builder.update_targets(new_docs) assert target.query_one(criteria={"k": 0})["v"] == "new" assert target.query_one(criteria={"k": 10})["v"] == "old" def test_run(source, target, old_docs, new_docs): source.update(old_docs) source.update(new_docs) target.update(old_docs) builder = CopyBuilder(source, target) builder.run() builder.target.connect() assert builder.target.query_one(criteria={"k": 0})["v"] == "new" assert builder.target.query_one(criteria={"k": 10})["v"] == "old" def test_query(source, target, old_docs, new_docs): builder = CopyBuilder(source, target) builder.query = {"k": {"$gt": 5}} 
source.update(old_docs) source.update(new_docs) builder.run() all_docs = list(target.query(criteria={})) assert len(all_docs) == 14 assert min([d["k"] for d in all_docs]) == 6 def test_delete_orphans(source, target, old_docs, new_docs): builder = CopyBuilder(source, target, delete_orphans=True) source.update(old_docs) source.update(new_docs) target.update(old_docs) deletion_criteria = {"k": {"$in": list(range(5))}} source._collection.delete_many(deletion_criteria) builder.run() assert target._collection.count_documents(deletion_criteria) == 0 assert target.query_one(criteria={"k": 5})["v"] == "new" assert target.query_one(criteria={"k": 10})["v"] == "old" def test_prechunk(source, target, old_docs, new_docs): builder = CopyBuilder(source, target, delete_orphans=True) source.update(old_docs) source.update(new_docs) chunk_queries = list(builder.prechunk(2)) assert len(chunk_queries) == 2 assert chunk_queries[0] == {"query": {"k": {"$in": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}}} maggma-0.70.0/tests/builders/test_group_builder.py000066400000000000000000000040441470132070100222520ustar00rootroot00000000000000""" Tests for group builder """ from datetime import datetime, timezone from random import randint import pytest from maggma.builders import GroupBuilder from maggma.stores import MemoryStore @pytest.fixture(scope="module") def now(): return datetime.now(timezone.utc) @pytest.fixture() def docs(now): return [{"k": i, "a": i % 3, "b": randint(0, i), "lu": now} for i in range(20)] @pytest.fixture() def source(docs): store = MemoryStore("source", key="k", last_updated_field="lu") store.connect() store.ensure_index("k") store.ensure_index("lu") store.update(docs) return store @pytest.fixture() def target(): store = MemoryStore("target", key="ks", last_updated_field="lu") store.connect() store.ensure_index("ks") store.ensure_index("lu") return store class DummyGrouper(GroupBuilder): def unary_function(self, items: list[dict]) -> dict: """ Processing function for GroupBuilder Args: items: list of documents that are already grouped by the grouping_keys Returns: Dictionary mapping: tuple of source document keys that are in the grouped document to the grouped and processed document """ new_doc = {} for k in self.grouping_keys: new_doc[k] = {d[k] for d in items} new_doc["b"] = [d["b"] for d in items] return new_doc def test_grouping(source, target, docs): builder = DummyGrouper(source, target, query={"k": {"$ne": 3}}, grouping_keys=["a"]) assert len(docs) - 1 == len(builder.get_ids_to_process()), f"{len(docs) -1} != {len(builder.get_ids_to_process())}" assert len(builder.get_groups_from_keys([d["k"] for d in docs])) == 3 to_process = list(builder.get_items()) assert len(to_process) == 3 processed = [builder.process_item(d) for d in to_process] assert len(processed) == 3 builder.update_targets(processed) assert len(builder.get_ids_to_process()) == 0, f"{len(builder.get_ids_to_process())} != 0" maggma-0.70.0/tests/builders/test_projection_builder.py000066400000000000000000000077631470132070100233050ustar00rootroot00000000000000""" Tests for Projection_Builder """ import pytest from maggma.builders.projection_builder import Projection_Builder from maggma.stores import MemoryStore @pytest.fixture() def source1(): store = MemoryStore("source1", key="k", last_updated_field="lu") store.connect() store.ensure_index("k") store.ensure_index("lu") store.update([{"k": k, "a": "a", "b": "b"} for k in range(10)]) return store @pytest.fixture() def source2(): store = MemoryStore("source2", key="k", 
last_updated_field="lu") store.connect() store.ensure_index("k") store.ensure_index("lu") store.update([{"k": k, "c": "c", "d": "d"} for k in range(15)]) return store @pytest.fixture() def target(): store = MemoryStore("target", key="k", last_updated_field="lu") store.connect() store.ensure_index("k") store.ensure_index("lu") return store def test_get_items(source1, source2, target): builder = Projection_Builder(source_stores=[source1, source2], target_store=target) items = next(iter(builder.get_items())) assert len(items) == 25 def test_process_item(source1, source2, target): # test fields_to_project = empty dict and list builder = Projection_Builder( source_stores=[source1, source2], target_store=target, fields_to_project=[[], {}], ) items = next(iter(builder.get_items())) outputs = builder.process_item(items) assert len(outputs) == 15 output = next(o for o in outputs if o["k"] < 10) assert all(k in ["k", "a", "b", "c", "d"] for k in output) output = next(o for o in outputs if o["k"] > 9) assert all(k in ["k", "c", "d"] for k in output) assert all(k not in ["a", "b"] for k in output) # test fields_to_project = lists builder = Projection_Builder( source_stores=[source1, source2], target_store=target, fields_to_project=[["a", "b"], ["d"]], ) items = next(iter(builder.get_items())) outputs = builder.process_item(items) output = next(o for o in outputs if o["k"] < 10) assert all(k in ["k", "a", "b", "d"] for k in output) assert all(k not in ["c"] for k in output) # test fields_to_project = dict and list builder = Projection_Builder( source_stores=[source1, source2], target_store=target, fields_to_project=[{"newa": "a", "b": "b"}, ["d"]], ) items = next(iter(builder.get_items())) outputs = builder.process_item(items) output = next(o for o in outputs if o["k"] < 10) assert all(k in ["k", "newa", "b", "d"] for k in output) assert all(k not in ["a", "c"] for k in output) def test_update_targets(source1, source2, target): builder = Projection_Builder( source_stores=[source1, source2], target_store=target, fields_to_project=[{"newa": "a", "b": "b"}, ["d"]], ) items = list(builder.get_items()) processed_chunk = [builder.process_item(item) for item in items] processed_items = [item for item in processed_chunk if item is not None] builder.update_targets(processed_items) assert len(list(target.query())) == 15 assert target.query_one(criteria={"k": 0})["newa"] == "a" assert target.query_one(criteria={"k": 0})["d"] == "d" assert target.query_one(criteria={"k": 10})["d"] == "d" assert "a" not in target.query_one(criteria={"k": 10}) def test_run(source1, source2, target): builder = Projection_Builder(source_stores=[source1, source2], target_store=target) builder.run() assert len(list(target.query())) == 15 assert target.query_one(criteria={"k": 0})["a"] == "a" assert target.query_one(criteria={"k": 0})["d"] == "d" assert target.query_one(criteria={"k": 10})["d"] == "d" assert "a" not in target.query_one(criteria={"k": 10}) def test_query(source1, source2, target): target.remove_docs({}) builder = Projection_Builder( source_stores=[source1, source2], target_store=target, query_by_key=[0, 1, 2, 3, 4], ) builder.run() assert len(list(target.query())) == 5 maggma-0.70.0/tests/cli/000077500000000000000000000000001470132070100147335ustar00rootroot00000000000000maggma-0.70.0/tests/cli/__init__.py000066400000000000000000000000001470132070100170320ustar00rootroot00000000000000maggma-0.70.0/tests/cli/builder_for_test.py000066400000000000000000000011071470132070100206370ustar00rootroot00000000000000from maggma.core 
import Builder class DummyBuilder(Builder): def __init__(self, total=10): self.get_called = 0 self.process_called = 0 self.update_called = 0 super().__init__(sources=[], targets=[]) self.total = total def get_items(self): for _i in range(self.total): self.get_called += 1 yield self.get_called def process_item(self, item): self.process_called += 1 return item def update_targets(self, items): self.update_called += 1 __builder__ = DummyBuilder() maggma-0.70.0/tests/cli/builder_notebook_for_test.ipynb000066400000000000000000000037621470132070100232410ustar00rootroot00000000000000{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# This is a markdown test just to mess up the test" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/shyamd/Dropbox/Codes/maggma/src/maggma/utils.py:20: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm\n" ] } ], "source": [ "from maggma.core import Builder" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class DummyBuilder(Builder):\n", " def __init__(self, total=10):\n", " self.get_called = 0\n", " self.process_called = 0\n", " self.update_called = 0\n", " super().__init__(sources=[], targets=[])\n", " self.total = total\n", "\n", " def get_items(self):\n", " for i in range(self.total):\n", " self.get_called += 1\n", " yield self.get_called\n", "\n", " def process_item(self, item):\n", " self.process_called += 1\n", " return item\n", "\n", " def update_targets(self, items):\n", " self.update_called += 1" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "__builders__ = [DummyBuilder()]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 } maggma-0.70.0/tests/cli/test_distributed.py000066400000000000000000000076571470132070100207050ustar00rootroot00000000000000import asyncio import json import socket as pysocket import threading import pytest import zmq.asyncio as zmq from zmq import REP, REQ from maggma.cli.distributed import find_port, manager, worker from maggma.core import Builder # TODO: Timeout errors? 
HOSTNAME = pysocket.gethostname() class DummyBuilderWithNoPrechunk(Builder): def __init__(self, dummy_prechunk: bool, val: int = -1, **kwargs): self.dummy_prechunk = dummy_prechunk self.connected = False self.kwargs = kwargs self.val = val super().__init__(sources=[], targets=[]) def connect(self): self.connected = True def get_items(self): return list(range(10)) def process_items(self, items): pass def update_targets(self, items): pass class DummyBuilder(DummyBuilderWithNoPrechunk): def prechunk(self, num_chunks): return [{"val": i} for i in range(num_chunks)] class DummyBuilderError(DummyBuilderWithNoPrechunk): def prechunk(self, num_chunks): return [{"val": i} for i in range(num_chunks)] def get_items(self): raise ValueError("Dummy error") def process_items(self, items): raise ValueError("Dummy error") SERVER_URL = "tcp://127.0.0.1" SERVER_PORT = 1234 @pytest.mark.xfail(raises=ValueError) def test_wrong_worker_input(log_to_stdout): manager( SERVER_URL, SERVER_PORT, [DummyBuilder(dummy_prechunk=False)], num_chunks=2, num_workers=0, ) def test_manager_and_worker(log_to_stdout): manager_thread = threading.Thread( target=manager, args=(SERVER_URL, SERVER_PORT, [DummyBuilder(dummy_prechunk=False)], 5, 3), ) manager_thread.start() worker_threads = [threading.Thread(target=worker, args=(SERVER_URL, SERVER_PORT, 1, True)) for _ in range(3)] for worker_thread in worker_threads: worker_thread.start() for worker_thread in worker_threads: worker_thread.join() manager_thread.join() @pytest.mark.asyncio() async def test_manager_worker_error(log_to_stdout): manager_thread = threading.Thread( target=manager, args=(SERVER_URL, SERVER_PORT, [DummyBuilder(dummy_prechunk=False)], 10, 1), ) manager_thread.start() context = zmq.Context() socket = context.socket(REQ) socket.connect(f"{SERVER_URL}:{SERVER_PORT}") await socket.send(b"ERROR_testerror") await asyncio.sleep(1) manager_thread.join() @pytest.mark.asyncio() async def test_worker_error(): context = zmq.Context() socket = context.socket(REP) socket.bind(f"{SERVER_URL}:{SERVER_PORT}") worker_task = threading.Thread(target=worker, args=(SERVER_URL, SERVER_PORT, 1, True)) worker_task.start() message = await socket.recv() assert message == f"READY_{HOSTNAME}".encode() dummy_work = { "@module": "tests.cli.test_distributed", "@class": "DummyBuilderError", "@version": None, "dummy_prechunk": False, "val": 0, } await socket.send(json.dumps(dummy_work).encode("utf-8")) await asyncio.sleep(1) message = await socket.recv() assert message.decode("utf-8") == "ERROR_Dummy error" worker_task.join() @pytest.mark.asyncio() async def test_worker_exit(): context = zmq.Context() socket = context.socket(REP) socket.bind(f"{SERVER_URL}:{SERVER_PORT}") worker_task = threading.Thread(target=worker, args=(SERVER_URL, SERVER_PORT, 1, True)) worker_task.start() message = await socket.recv() assert message == f"READY_{HOSTNAME}".encode() await asyncio.sleep(1) await socket.send(b"EXIT") await asyncio.sleep(1) assert not worker_task.is_alive() worker_task.join() @pytest.mark.xfail() def test_no_prechunk(caplog): manager( SERVER_URL, SERVER_PORT, [DummyBuilderWithNoPrechunk(dummy_prechunk=False)], 10, 1, ) def test_find_port(): assert find_port() > 0 maggma-0.70.0/tests/cli/test_init.py000066400000000000000000000156361470132070100173220ustar00rootroot00000000000000import shutil from datetime import datetime from pathlib import Path import pytest from click.testing import CliRunner from monty.serialization import dumpfn from maggma.builders import CopyBuilder from maggma.cli import 
run from maggma.stores import MemoryStore, MongoStore @pytest.fixture() def mongostore(): store = MongoStore("maggma_test", "test") store.connect() store.remove_docs({}) yield store store.remove_docs({}) store._collection.drop() @pytest.fixture() def reporting_store(): store = MongoStore("maggma_test", "reporting") store.connect() store.remove_docs({}) yield store store.remove_docs({}) store._collection.drop() def test_basic_run(): runner = CliRunner() result = runner.invoke(run, ["--help"]) assert result.exit_code == 0 # Ensure running without any builders fail result = runner.invoke(run) assert result.exit_code != 0 def test_run_builder(mongostore): memorystore = MemoryStore("temp") builder = CopyBuilder(mongostore, memorystore) mongostore.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) runner = CliRunner() with runner.isolated_filesystem(): dumpfn(builder, "test_builder.json") result = runner.invoke(run, ["-v", "test_builder.json"]) assert result.exit_code == 0 assert "CopyBuilder" in result.output assert "SerialProcessor" in result.output result = runner.invoke(run, ["-vvv", "--no_bars", "test_builder.json"]) assert result.exit_code == 0 assert "Get" not in result.output assert "Update" not in result.output result = runner.invoke(run, ["-v", "-n", "2", "test_builder.json"]) assert result.exit_code == 0 assert "CopyBuilder" in result.output assert "MultiProcessor" in result.output result = runner.invoke(run, ["-vvv", "-n", "2", "--no_bars", "test_builder.json"]) assert result.exit_code == 0 assert "Get" not in result.output assert "Update" not in result.output def test_run_builder_chain(mongostore): memorystore = MemoryStore("temp") builder1 = CopyBuilder(mongostore, memorystore) builder2 = CopyBuilder(mongostore, memorystore) mongostore.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) runner = CliRunner() with runner.isolated_filesystem(): dumpfn([builder1, builder2], "test_builders.json") result = runner.invoke(run, ["-v", "test_builders.json"]) assert result.exit_code == 0 assert "CopyBuilder" in result.output assert "SerialProcessor" in result.output result = runner.invoke(run, ["-vvv", "--no_bars", "test_builders.json"]) assert result.exit_code == 0 assert "Get" not in result.output assert "Update" not in result.output result = runner.invoke(run, ["-v", "-n", "2", "test_builders.json"]) assert result.exit_code == 0 assert "CopyBuilder" in result.output assert "MultiProcessor" in result.output result = runner.invoke(run, ["-vvv", "-n", "2", "--no_bars", "test_builders.json"]) assert result.exit_code == 0 assert "Get" not in result.output assert "Update" not in result.output def test_reporting(mongostore, reporting_store): memorystore = MemoryStore("temp") builder = CopyBuilder(mongostore, memorystore) mongostore.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) runner = CliRunner() with runner.isolated_filesystem(): dumpfn(builder, "test_builder.json") dumpfn(reporting_store, "test_reporting_store.json") result = runner.invoke(run, ["-v", "test_builder.json", "-r", "test_reporting_store.json"]) assert result.exit_code == 0 report_docs = list(reporting_store.query()) assert len(report_docs) == 3 start_doc = next(d for d in report_docs if d["event"] == "BUILD_STARTED") assert "sources" in start_doc assert "targets" in start_doc end_doc = next(d for d in report_docs if d["event"] == "BUILD_ENDED") assert "errors" in end_doc assert 
"warnings" in end_doc update_doc = next(d for d in report_docs if d["event"] == "UPDATE") assert "items" in update_doc def test_python_source(): runner = CliRunner() with runner.isolated_filesystem(): shutil.copy2(src=Path(__file__).parent / "builder_for_test.py", dst=Path(".").resolve()) result = runner.invoke(run, ["-v", "-n", "2", "builder_for_test.py"]) assert result.exit_code == 0 assert "Ended multiprocessing: DummyBuilder" in result.output def test_python_notebook_source(): runner = CliRunner() with runner.isolated_filesystem(): shutil.copy2( src=Path(__file__).parent / "builder_notebook_for_test.ipynb", dst=Path(".").resolve(), ) result = runner.invoke(run, ["-v", "-n", "2", "builder_notebook_for_test.ipynb"]) assert result.exit_code == 0 assert "Ended multiprocessing: DummyBuilder" in result.output def test_memray_run_builder(mongostore): memorystore = MemoryStore("temp") builder = CopyBuilder(mongostore, memorystore) mongostore.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) runner = CliRunner() with runner.isolated_filesystem(): dumpfn(builder, "test_builder.json") result = runner.invoke(run, ["-v", "--memray", "on", "test_builder.json"]) assert result.exit_code == 0 assert "CopyBuilder" in result.output assert "SerialProcessor" in result.output result = runner.invoke(run, ["-vvv", "--no_bars", "--memray", "on", "test_builder.json"]) assert result.exit_code == 0 assert "Get" not in result.output assert "Update" not in result.output result = runner.invoke(run, ["-v", "-n", "2", "--memray", "on", "test_builder.json"]) assert result.exit_code == 0 assert "CopyBuilder" in result.output assert "MultiProcessor" in result.output result = runner.invoke(run, ["-vvv", "-n", "2", "--no_bars", "--memray", "on", "test_builder.json"]) assert result.exit_code == 0 assert "Get" not in result.output assert "Update" not in result.output def test_memray_user_output_dir(mongostore): memorystore = MemoryStore("temp") builder = CopyBuilder(mongostore, memorystore) mongostore.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) runner = CliRunner() with runner.isolated_filesystem(): dumpfn(builder, "test_builder.json") result = runner.invoke(run, ["--memray", "on", "-md", "memray_output_dir/", "test_builder.json"]) assert result.exit_code == 0 assert (Path.cwd() / "memray_output_dir").exists() is True maggma-0.70.0/tests/cli/test_multiprocessing.py000066400000000000000000000035361470132070100216020ustar00rootroot00000000000000import time from concurrent.futures import ThreadPoolExecutor import pytest from maggma.cli.multiprocessing import AsyncUnorderedMap, BackPressure, grouper, safe_dispatch @pytest.mark.asyncio() async def test_grouper(): async def arange(count): for i in range(count): yield (i) async for group in grouper(arange(100), n=10): assert len(group) == 10 async for group in grouper(arange(9), n=10): assert len(group) == 9 def wait_and_return(x): time.sleep(1) return x * x async def arange(n): for num in range(n): yield num @pytest.mark.asyncio() async def test_backpressure(): iterable = range(10) backpressure = BackPressure(iterable, 2) # Put two items into the process queue await backpressure.__anext__() await backpressure.__anext__() # Ensure back_pressure enabled assert backpressure.back_pressure.locked() # Release back pressure releaser = backpressure.release(arange(10)) await releaser.__anext__() assert not backpressure.back_pressure.locked() # Ensure can keep releasing backing pressure 
and won't error await releaser.__anext__() await releaser.__anext__() # Ensure stop iteration works with pytest.raises(StopAsyncIteration): # noqa: PT012 for _i in range(10): await releaser.__anext__() assert not backpressure.back_pressure.locked() @pytest.mark.asyncio() async def test_async_map(): executor = ThreadPoolExecutor(1) amap = AsyncUnorderedMap(wait_and_return, arange(3), executor) true_values = {x * x for x in range(3)} finished_vals = set() async for finished_val in amap: finished_vals.add(finished_val) assert finished_vals == true_values def test_safe_dispatch(): def bad_func(val): raise ValueError("AAAH") safe_dispatch((bad_func, "")) maggma-0.70.0/tests/cli/test_serial.py000066400000000000000000000014041470132070100176220ustar00rootroot00000000000000from maggma.cli.serial import serial from maggma.core import Builder class TestBuilder(Builder): def __init__(self, total=10): self.get_called = 0 self.process_called = 0 self.update_called = 0 super().__init__(sources=[], targets=[]) self.total = total def get_items(self): for _i in range(self.total): self.get_called += 1 yield self.get_called def process_item(self, item): self.process_called += 1 return item def update_targets(self, items): self.update_called += 1 def test_serial(): builder = TestBuilder() serial(builder) assert builder.get_called == 10 assert builder.process_called == 10 assert builder.update_called == 1 maggma-0.70.0/tests/conftest.py000066400000000000000000000032261470132070100163660ustar00rootroot00000000000000import logging import sys from pathlib import Path import pytest @pytest.fixture() def tmp_dir(): # noqa: PT004 """ Create a clean directory and cd into it. The directory will be removed at the end of the test. """ import os import shutil import tempfile old_cwd = os.getcwd() newpath = tempfile.mkdtemp() os.chdir(newpath) yield os.chdir(old_cwd) shutil.rmtree(newpath) @pytest.fixture() def test_dir(): module_dir = Path(__file__).resolve().parent test_dir = module_dir / "test_files" return test_dir.resolve() @pytest.fixture() def db_json(test_dir): db_dir = test_dir / "settings_files" db_json = db_dir / "db.json" return db_json.resolve() @pytest.fixture() def lp_file(test_dir): db_dir = test_dir / "settings_files" lp_file = db_dir / "my_launchpad.yaml" return lp_file.resolve() @pytest.fixture() def log_to_stdout(): # Set Logging root = logging.getLogger() root.setLevel(logging.DEBUG) ch = logging.StreamHandler(sys.stdout) formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) root.addHandler(ch) return root def pytest_itemcollected(item): """Make tests names more readable in the tests output.""" item._nodeid = ( item._nodeid.replace(".py", "") .replace("tests/", "") .replace("test_", "") .replace("_", " ") .replace("Test", "") .replace("Class", " class") .lower() ) doc = item.obj.__doc__.strip() if item.obj.__doc__ else "" if doc: item._nodeid = item._nodeid.split("::")[0] + "::" + doc maggma-0.70.0/tests/stores/000077500000000000000000000000001470132070100155035ustar00rootroot00000000000000maggma-0.70.0/tests/stores/__init__.py000066400000000000000000000000001470132070100176020ustar00rootroot00000000000000maggma-0.70.0/tests/stores/test_advanced_stores.py000066400000000000000000000275121470132070100222670ustar00rootroot00000000000000""" Tests for advanced stores """ import os import shutil import signal import subprocess import tempfile import time from unittest.mock import patch from uuid import uuid4 import pytest from mongogrant import 
Client from mongogrant.client import check, seed from mongogrant.config import Config from pymongo import MongoClient from pymongo.collection import Collection from maggma.core import StoreError from maggma.stores import AliasingStore, MemoryStore, MongograntStore, MongoStore, SandboxStore, VaultStore from maggma.stores.advanced_stores import substitute @pytest.fixture() def mongostore(): store = MongoStore("maggma_test", "test") store.connect() yield store store._collection.drop() @pytest.fixture(scope="module") def mgrant_server(): # TODO: This is whacked code that starts a mongo server. How do we fix this? _, config_path = tempfile.mkstemp() _, mdlogpath = tempfile.mkstemp() mdpath = tempfile.mkdtemp() mdport = 27020 if not os.getenv("CONTINUOUS_INTEGRATION"): basecmd = f"mongod --port {mdport} --dbpath {mdpath} --quiet --logpath {mdlogpath} --bind_ip_all --auth" mongod_process = subprocess.Popen(basecmd, shell=True, start_new_session=True) time.sleep(5) client = MongoClient(port=mdport) client.admin.command("createUser", "mongoadmin", pwd="mongoadminpass", roles=["root"]) client.close() else: pytest.skip("Disabling mongogrant tests on CI for now") dbname = "test_" + uuid4().hex db = MongoClient(f"mongodb://mongoadmin:mongoadminpass@127.0.0.1:{mdport}/admin")[dbname] db.command("createUser", "reader", pwd="readerpass", roles=["read"]) db.command("createUser", "writer", pwd="writerpass", roles=["readWrite"]) db.client.close() # Yields the fixture to use yield config_path, mdport, dbname if not (os.getenv("CONTINUOUS_INTEGRATION") and os.getenv("TRAVIS")): os.killpg(os.getpgid(mongod_process.pid), signal.SIGTERM) os.waitpid(mongod_process.pid, 0) os.remove(config_path) shutil.rmtree(mdpath) os.remove(mdlogpath) @pytest.fixture(scope="module") def mgrant_user(mgrant_server): config_path, mdport, dbname = mgrant_server config = Config(check=check, path=config_path, seed=seed()) client = Client(config) client.set_auth( host=f"localhost:{mdport}", db=dbname, role="read", username="reader", password="readerpass", ) client.set_auth( host=f"localhost:{mdport}", db=dbname, role="readWrite", username="writer", password="writerpass", ) client.set_alias("testhost", f"localhost:{mdport}", which="host") client.set_alias("testdb", dbname, which="db") return client def connected_user(store): return store._collection.database.command("connectionStatus")["authInfo"]["authenticatedUsers"][0]["user"] def test_mgrant_init(): with pytest.raises(StoreError): store = MongograntStore("", "", username="") with pytest.raises(ValueError): # noqa: PT012 store = MongograntStore("", "") store.connect() def test_mgrant_connect(mgrant_server, mgrant_user): config_path, mdport, dbname = mgrant_server assert mgrant_user is not None store = MongograntStore("ro:testhost/testdb", "tasks", mgclient_config_path=config_path) store.connect() assert isinstance(store._collection, Collection) assert connected_user(store) == "reader" store = MongograntStore("rw:testhost/testdb", "tasks", mgclient_config_path=config_path) store.connect() assert isinstance(store._collection, Collection) assert connected_user(store) == "writer" def test_mgrant_differences(): with pytest.raises(ValueError): MongograntStore.from_db_file("") with pytest.raises(ValueError): MongograntStore.from_collection("") def test_mgrant_equal(mgrant_server, mgrant_user): config_path, mdport, dbname = mgrant_server assert mgrant_user is not None store1 = MongograntStore("ro:testhost/testdb", "tasks", mgclient_config_path=config_path) store1.connect() store2 = 
MongograntStore("ro:testhost/testdb", "tasks", mgclient_config_path=config_path) store3 = MongograntStore("ro:testhost/testdb", "test", mgclient_config_path=config_path) store2.connect() assert store1 == store2 assert store1 != store3 def vault_store(): with patch("hvac.Client") as mock: instance = mock.return_value instance.auth_github.return_value = True instance.is_authenticated.return_value = True instance.read.return_value = { "wrap_info": None, "request_id": "2c72c063-2452-d1cd-19a2-91163c7395f7", "data": { "value": '{"db": "mg_core_prod", "host": "matgen2.lbl.gov", "username": "test", "password": "pass"}' }, "auth": None, "warnings": None, "renewable": False, "lease_duration": 2764800, "lease_id": "", } return VaultStore("test_coll", "secret/matgen/maggma") def test_vault_init(): """ Test initing a vault store using a mock hvac client """ os.environ["VAULT_ADDR"] = "https://fake:8200/" os.environ["VAULT_TOKEN"] = "dummy" # Just test that we successfully instantiated v = vault_store() assert isinstance(v, MongoStore) def test_vault_github_token(): """ Test using VaultStore with GITHUB_TOKEN and mock hvac """ # Save token in env os.environ["VAULT_ADDR"] = "https://fake:8200/" os.environ["GITHUB_TOKEN"] = "dummy" v = vault_store() # Just test that we successfully instantiated assert isinstance(v, MongoStore) def test_vault_missing_env(): """ Test VaultStore should raise an error if environment is not set """ del os.environ["VAULT_TOKEN"] del os.environ["VAULT_ADDR"] del os.environ["GITHUB_TOKEN"] # Create should raise an error with pytest.raises(RuntimeError): vault_store() @pytest.fixture() def alias_store(): memorystore = MemoryStore("test") memorystore.connect() return AliasingStore(memorystore, {"a": "b", "c.d": "e", "f": "g.h"}) def test_alias_count(alias_store): d = [{"b": 1}, {"e": 2}, {"g": {"h": 3}}] alias_store.store._collection.insert_many(d) assert alias_store.count({"a": 1}) == 1 def test_aliasing_query(alias_store): d = [{"b": 1}, {"e": 2}, {"g": {"h": 3}}] alias_store.store._collection.insert_many(d) assert "a" in next(iter(alias_store.query(criteria={"a": {"$exists": 1}}))) assert "c" in next(iter(alias_store.query(criteria={"c.d": {"$exists": 1}}))) assert "d" in next(iter(alias_store.query(criteria={"c.d": {"$exists": 1}}))).get("c", {}) assert "f" in next(iter(alias_store.query(criteria={"f": {"$exists": 1}}))) def test_aliasing_update(alias_store): alias_store.update( [ {"task_id": "mp-3", "a": 4}, {"task_id": "mp-4", "c": {"d": 5}}, {"task_id": "mp-5", "f": 6}, ] ) assert next(iter(alias_store.query(criteria={"task_id": "mp-3"})))["a"] == 4 assert next(iter(alias_store.query(criteria={"task_id": "mp-4"})))["c"]["d"] == 5 assert next(iter(alias_store.query(criteria={"task_id": "mp-5"})))["f"] == 6 assert next(iter(alias_store.store.query(criteria={"task_id": "mp-3"})))["b"] == 4 assert next(iter(alias_store.store.query(criteria={"task_id": "mp-4"})))["e"] == 5 assert next(iter(alias_store.store.query(criteria={"task_id": "mp-5"})))["g"]["h"] == 6 def test_aliasing_remove_docs(alias_store): alias_store.update( [ {"task_id": "mp-3", "a": 4}, {"task_id": "mp-4", "c": {"d": 5}}, {"task_id": "mp-5", "f": 6}, ] ) assert alias_store.query_one(criteria={"task_id": "mp-3"}) assert alias_store.query_one(criteria={"task_id": "mp-4"}) assert alias_store.query_one(criteria={"task_id": "mp-5"}) alias_store.remove_docs({"a": 4}) assert alias_store.query_one(criteria={"task_id": "mp-3"}) is None def test_aliasing_substitute(alias_store): aliases = {"a": "b", "c.d": "e", "f": "g.h"} 
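# Note added for clarity (not in the original test file): the assertions below exercise
# maggma.stores.advanced_stores.substitute, which rewrites a document's keys in place,
# converting each stored field name (the alias map's value) back to its aliased name
# (the key): "b" -> "a", flat "e" -> nested {"c": {"d": ...}}, nested {"g": {"h": ...}} -> "f",
# and a None document passes through unchanged.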
d = {"b": 1} substitute(d, aliases) assert "a" in d d = {"e": 1} substitute(d, aliases) assert "c" in d assert "d" in d.get("c", {}) d = {"g": {"h": 4}} substitute(d, aliases) assert "f" in d d = None substitute(d, aliases) assert d is None def test_aliasing_distinct(alias_store): d = [{"b": 1}, {"e": 2}, {"g": {"h": 3}}] alias_store.store._collection.insert_many(d) assert alias_store.distinct("a") == [1] assert alias_store.distinct("c.d") == [2] assert alias_store.distinct("f") == [3] @pytest.fixture() def sandbox_store(): memstore = MemoryStore() store = SandboxStore(memstore, sandbox="test") store.connect() return store def test_sandbox_count(sandbox_store): sandbox_store._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert sandbox_store.count({"a": 1}) == 1 sandbox_store._collection.insert_one({"a": 1, "b": 3, "sbxn": ["test"]}) assert sandbox_store.count({"a": 1}) == 2 def test_sandbox_query(sandbox_store): sandbox_store._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert sandbox_store.query_one(properties=["a"])["a"] == 1 sandbox_store._collection.insert_one({"a": 2, "b": 2, "sbxn": ["test"]}) assert sandbox_store.query_one(properties=["b"], criteria={"a": 2})["b"] == 2 sandbox_store._collection.insert_one({"a": 3, "b": 2, "sbxn": ["not_test"]}) assert sandbox_store.query_one(properties=["c"], criteria={"a": 3}) is None def test_sandbox_distinct(sandbox_store): sandbox_store.connect() sandbox_store._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert sandbox_store.distinct("a") == [1] sandbox_store._collection.insert_one({"a": 4, "d": 5, "e": 6, "sbxn": ["test"]}) assert sandbox_store.distinct("a")[1] == 4 sandbox_store._collection.insert_one({"a": 7, "d": 8, "e": 9, "sbxn": ["not_test"]}) assert sandbox_store.distinct("a")[1] == 4 def test_sandbox_update(sandbox_store): sandbox_store.connect() sandbox_store.update([{"e": 6, "d": 4}], key="e") assert next(sandbox_store.query(criteria={"d": {"$exists": 1}}, properties=["d"]))["d"] == 4 assert sandbox_store._collection.find_one({"e": 6})["sbxn"] == ["test"] sandbox_store.update([{"e": 7, "sbxn": ["core"]}], key="e") assert set(sandbox_store.query_one(criteria={"e": 7})["sbxn"]) == {"test", "core"} def test_sandbox_remove_docs(sandbox_store): sandbox_store.connect() sandbox_store.update([{"e": 6, "d": 4}], key="e") sandbox_store.update([{"e": 7, "sbxn": ["core"]}], key="e") assert sandbox_store.query_one(criteria={"d": {"$exists": 1}}, properties=["d"]) assert sandbox_store.query_one(criteria={"e": 7}) sandbox_store.remove_docs(criteria={"d": 4}) assert sandbox_store.query_one(criteria={"d": {"$exists": 1}}, properties=["d"]) is None assert sandbox_store.query_one(criteria={"e": 7}) @pytest.fixture() def mgrantstore(mgrant_server, mgrant_user): config_path, mdport, dbname = mgrant_server assert mgrant_user is not None store = MongograntStore("ro:testhost/testdb", "tasks", mgclient_config_path=config_path) store.connect() return store @pytest.fixture() def vaultstore(): os.environ["VAULT_ADDR"] = "https://fake:8200/" os.environ["VAULT_TOKEN"] = "dummy" # Just test that we successfully instantiated return vault_store() def test_eq_mgrant(mgrantstore, mongostore): assert mgrantstore == mgrantstore assert mgrantstore != mongostore def test_eq(vaultstore, alias_store, sandbox_store): assert alias_store == alias_store assert sandbox_store == sandbox_store assert vaultstore == vaultstore assert sandbox_store != alias_store assert alias_store != vaultstore assert vaultstore != sandbox_store 
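# --- Added usage sketch (not part of the original maggma test suite) ---
# A minimal illustration of the AliasingStore pattern exercised by the tests above:
# it wraps another store and translates aliased field names ("a") to the names
# actually stored in the wrapped store ("b"). Everything here mirrors the
# alias_store fixture and test_aliasing_update / test_aliasing_query; treat it as
# an illustrative sketch rather than additional test coverage.
from maggma.stores import AliasingStore, MemoryStore

inner = MemoryStore("aliasing_example")
inner.connect()
aliased = AliasingStore(inner, {"a": "b"})

# Writing through the aliasing layer stores the underlying field name...
aliased.update([{"task_id": "mp-1", "a": 4}])
assert inner.query_one(criteria={"task_id": "mp-1"})["b"] == 4
# ...while reads through the aliasing layer translate it back to the alias.
assert aliased.query_one(criteria={"task_id": "mp-1"})["a"] == 4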
maggma-0.70.0/tests/stores/test_aws.py000066400000000000000000000331131470132070100177070ustar00rootroot00000000000000import time from datetime import datetime import boto3 import pytest from botocore.exceptions import ClientError from moto import mock_aws from sshtunnel import BaseSSHTunnelForwarderError from maggma.stores import MemoryStore, MongoStore, S3Store from maggma.stores.ssh_tunnel import SSHTunnel @pytest.fixture() def mongostore(): store = MongoStore("maggma_test", "test") store.connect() yield store store._collection.drop() @pytest.fixture() def ssh_tunnel(): try: tunnel = SSHTunnel("127.0.0.1:22", "127.0.0.1:27017", local_port=9000) except (ValueError, BaseSSHTunnelForwarderError): # fallback to not use a tunnel if there is error in creating the tunnel tunnel = None return tunnel @pytest.fixture() def s3store(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index", key="task_id") store = S3Store(index, "bucket1", key="task_id") store.connect() store.update( [ { "task_id": "mp-1", "data": "asd", store.last_updated_field: datetime.utcnow(), } ] ) store.update( [ { "task_id": "mp-3", "data": "sdf", store.last_updated_field: datetime.utcnow(), } ] ) yield store @pytest.fixture() def s3store_w_subdir(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index") store = S3Store(index, "bucket1", sub_dir="subdir1", s3_workers=1) store.connect() yield store @pytest.fixture() def s3store_multi(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index") store = S3Store(index, "bucket1", s3_workers=4) store.connect() yield store @pytest.fixture() def s3store_with_tunnel(ssh_tunnel): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index", key="task_id") store = S3Store(index, "bucket1", key="task_id", ssh_tunnel=ssh_tunnel) store.connect() yield store def test_keys(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index", key=1) with pytest.raises(AssertionError, match=r"Since we are.*"): S3Store(index, "bucket1", s3_workers=4, key="1") index = MemoryStore("index", key="key1") with pytest.warns(UserWarning, match=r"The desired S3Store.*$"): store = S3Store(index, "bucket1", s3_workers=4, key="key2") store.connect() store.update({"key1": "mp-1", "data": "1234"}) with pytest.raises(KeyError): store.update({"key2": "mp-2", "data": "1234"}) assert store.key == store.index.key == "key1" def test_multi_update(s3store, s3store_multi): data = [ { "task_id": str(j), "data": "DATA", s3store_multi.last_updated_field: datetime.utcnow(), } for j in range(32) ] def fake_writing(doc, search_keys): time.sleep(0.20) return {k: doc[k] for k in search_keys} s3store.write_doc_to_s3 = fake_writing s3store_multi.write_doc_to_s3 = fake_writing start = time.time() s3store_multi.update(data, key=["task_id"]) end = time.time() time_multi = end - start start = time.time() s3store.update(data, key=["task_id"]) end = time.time() time_single = end - start assert time_single > time_multi * (s3store_multi.s3_workers - 1) / (s3store.s3_workers) def test_count(s3store): assert s3store.count() == 2 assert s3store.count({"task_id": "mp-3"}) == 1 def test_query(s3store): assert s3store.query_one(criteria={"task_id": "mp-2"}) is 
None assert s3store.query_one(criteria={"task_id": "mp-1"})["data"] == "asd" assert s3store.query_one(criteria={"task_id": "mp-3"})["data"] == "sdf" assert len(list(s3store.query())) == 2 def test_update(s3store): s3store.update( [ { "task_id": "mp-199999", "data": "asd", s3store.last_updated_field: datetime.utcnow(), } ] ) assert s3store.query_one({"task_id": "mp-199999"}) is not None s3store.compress = True s3store.update([{"task_id": "mp-4", "data": "asd"}]) obj = s3store.index.query_one({"task_id": "mp-4"}) assert obj["compression"] == "zlib" assert obj["obj_hash"] == "be74de5ac71f00ec9e96441a3c325b0592c07f4c" assert s3store.query_one({"task_id": "mp-4"})["data"] == "asd" def test_rebuild_meta_from_index(s3store): s3store.update([{"task_id": "mp-2", "data": "asd"}]) s3store.index.update({"task_id": "mp-2", "add_meta": "hello"}) s3store.rebuild_metadata_from_index() s3_object = s3store.s3_bucket.Object("mp-2") assert s3_object.metadata["add_meta"] == "hello" def test_rebuild_index(s3store): s3store.update([{"task_id": "mp-2", "data": "asd"}]) assert s3store.index.query_one({"task_id": "mp-2"})["obj_hash"] == "a69fe0c2cca3a3384c2b1d2f476972704f179741" s3store.index.remove_docs({}) assert s3store.index.query_one({"task_id": "mp-2"}) is None s3store.rebuild_index_from_s3_data() assert s3store.index.query_one({"task_id": "mp-2"})["obj_hash"] == "a69fe0c2cca3a3384c2b1d2f476972704f179741" def tests_msonable_read_write(s3store): dd = s3store.as_dict() s3store.update([{"task_id": "mp-2", "data": dd}]) res = s3store.query_one({"task_id": "mp-2"}) assert res["data"]["@module"] == "maggma.stores.aws" def test_remove(s3store): def objects_in_bucket(key): objs = list(s3store.s3_bucket.objects.filter(Prefix=key)) return key in [o.key for o in objs] s3store.update([{"task_id": "mp-2", "data": "asd"}]) s3store.update([{"task_id": "mp-4", "data": "asd"}]) s3store.update({"task_id": "mp-5", "data": "aaa"}) assert s3store.query_one({"task_id": "mp-2"}) is not None assert s3store.query_one({"task_id": "mp-4"}) is not None assert objects_in_bucket("mp-2") assert objects_in_bucket("mp-4") s3store.remove_docs({"task_id": "mp-2"}) s3store.remove_docs({"task_id": "mp-4"}, remove_s3_object=True) assert objects_in_bucket("mp-2") assert not objects_in_bucket("mp-4") assert s3store.query_one({"task_id": "mp-5"}) is not None def test_close(s3store): list(s3store.query()) s3store.close() with pytest.raises(AttributeError): list(s3store.query()) def test_bad_import(mocker): mocker.patch("maggma.stores.aws.boto3", None) index = MemoryStore("index") with pytest.raises(RuntimeError): S3Store(index, "bucket1") def test_aws_error(s3store): def raise_exception_NoSuchKey(data): error_response = {"Error": {"Code": "NoSuchKey", "Message": "The specified key does not exist."}} raise ClientError(error_response, "raise_exception") def raise_exception_other(data): error_response = {"Error": {"Code": 405}} raise ClientError(error_response, "raise_exception") s3store.s3_bucket.Object = raise_exception_other with pytest.raises(ClientError): s3store.query_one() # Should just pass s3store.s3_bucket.Object = raise_exception_NoSuchKey s3store.query_one() def test_eq(mongostore, s3store): assert s3store == s3store assert mongostore != s3store def test_count_subdir(s3store_w_subdir): s3store_w_subdir.update([{"task_id": "mp-1", "data": "asd"}]) s3store_w_subdir.update([{"task_id": "mp-2", "data": "asd"}]) assert s3store_w_subdir.count() == 2 assert s3store_w_subdir.count({"task_id": "mp-2"}) == 1 def test_subdir_field(s3store_w_subdir): 
s3store_w_subdir.update([{"task_id": "mp-1", "data": "asd"}]) s3store_w_subdir.update([{"task_id": "mp-2", "data": "asd"}]) for cc in s3store_w_subdir.index.query(): assert len(cc["sub_dir"]) > 0 assert cc["sub_dir"] == s3store_w_subdir.sub_dir def test_remove_subdir(s3store_w_subdir): s3store_w_subdir.update([{"task_id": "mp-2", "data": "asd"}]) s3store_w_subdir.update([{"task_id": "mp-4", "data": "asd"}]) assert s3store_w_subdir.query_one({"task_id": "mp-2"}) is not None assert s3store_w_subdir.query_one({"task_id": "mp-4"}) is not None s3store_w_subdir.remove_docs({"task_id": "mp-2"}) assert s3store_w_subdir.query_one({"task_id": "mp-2"}) is None assert s3store_w_subdir.query_one({"task_id": "mp-4"}) is not None def test_searchable_fields(s3store): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, s3store.last_updated_field: tic} for i in range(4)] s3store.searchable_fields = ["task_id"] s3store.update(data, key="a") # This should only work if the searchable field was put into the index store assert set(s3store.distinct("task_id")) == {"mp-0", "mp-1", "mp-2", "mp-3"} def test_newer_in(s3store): with mock_aws(): tic = datetime(2018, 4, 12, 16) tic2 = datetime.utcnow() conn = boto3.client("s3") conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket") index_old = MemoryStore("index_old") old_store = S3Store(index_old, "bucket") old_store.connect() old_store.update([{"task_id": "mp-1", "last_updated": tic}]) old_store.update([{"task_id": "mp-2", "last_updated": tic}]) index_new = MemoryStore("index_new") new_store = S3Store(index_new, "bucket") new_store.connect() new_store.update([{"task_id": "mp-1", "last_updated": tic2}]) new_store.update([{"task_id": "mp-2", "last_updated": tic2}]) assert len(old_store.newer_in(new_store)) == 2 assert len(new_store.newer_in(old_store)) == 0 assert len(old_store.newer_in(new_store.index)) == 2 assert len(new_store.newer_in(old_store.index)) == 0 def test_additional_metadata(s3store): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, s3store.last_updated_field: tic} for i in range(4)] s3store.update(data, key="a", additional_metadata="task_id") # This should only work if the searchable field was put into the index store assert set(s3store.distinct("task_id")) == {"mp-0", "mp-1", "mp-2", "mp-3"} def test_get_session(s3store): index = MemoryStore("index") store = S3Store( index, "bucket1", s3_profile={ "aws_access_key_id": "ACCESS_KEY", "aws_secret_access_key": "SECRET_KEY", }, ) assert store._get_session().get_credentials().access_key == "ACCESS_KEY" assert store._get_session().get_credentials().secret_key == "SECRET_KEY" def test_no_bucket(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index") store = S3Store(index, "bucket2") with pytest.raises(RuntimeError, match=r".*Bucket not present.*"): store.connect() def test_force_reset(s3store): content = [ { "task_id": "mp-4", "data": "abc", s3store.last_updated_field: datetime.utcnow(), } ] s3store.connect(force_reset=True) s3store.update(content) assert s3store.count({"task_id": "mp-4"}) == 1 s3store.s3 = None s3store.connect() s3store.update(content) assert s3store.count({"task_id": "mp-4"}) == 1 s3store.close() def test_ssh_tunnel(s3store_with_tunnel): """This test will actually create a real tunnel to test the functionality. The tunnel will be set to `None` if the tunnel cannot be created. 
As a result, it becomes a test not testing the functionality of S3Store with the tunnel. """ content = [ { "task_id": "mp-4", "data": "abc", s3store_with_tunnel.last_updated_field: datetime.utcnow(), } ] s3store_with_tunnel.update(content) assert s3store_with_tunnel.count({"task_id": "mp-4"}) == 1 s3store_with_tunnel.close() def test_ssh_tunnel_2(): """ This test mocks the SSHTunnel behavior by creating a fake tunnel. The purpose is to check the behavior of the S3Store when the tunnel is not `None`. This complements the `test_ssh_tunnel` test above. """ class FakeTunnel: def __init__(self, *args, **kwargs): pass def start(self): pass def stop(self): pass def local_address(self): return "ADDRESS", "PORT" def get_store(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") index = MemoryStore("index", key="task_id") store = S3Store(index, "bucket1", key="task_id", ssh_tunnel=FakeTunnel()) store.connect() store._get_session() assert store._get_endpoint_url() == "http://ADDRESS:PORT" store.close() yield store get_store() def test_index_store_kwargs(mongostore): index = MongoStore("db", collection_name="index", key="task_id") store = S3Store(index, "bucket1", key="task_id", index_store_kwargs={"port": 12345}) assert store.index.port == 12345 maggma-0.70.0/tests/stores/test_azure.py000066400000000000000000000324541470132070100202520ustar00rootroot00000000000000""" Azure testing requires Azurite. It can be set up according to the instructions: https://github.com/Azure/Azurite With docker can be started by running: docker run -p 10000:10000 mcr.microsoft.com/azure-storage/azurite azurite-blob --blobHost 0.0.0.0 """ import time from contextlib import contextmanager from datetime import datetime import pytest from maggma.stores import AzureBlobStore, MemoryStore, MongoStore try: import azure.storage.blob as azure_blob from azure.storage.blob import BlobServiceClient except (ImportError, ModuleNotFoundError): azure_blob = None # type: ignore AZURITE_CONNECTION_STRING = ( "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" "AccountKey=" "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq" "/K1SZFPTOtr/KBHBeksoGMGw==;" "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" ) AZURITE_CONTAINER_NAME = "maggma-test-container" @pytest.fixture() def mongostore(): store = MongoStore("maggma_test", "test") store.connect() yield store store._collection.drop() # a context manager and not a fixture to handle multiple containers in the # same test @contextmanager def azurite_container(container_name=AZURITE_CONTAINER_NAME, create_container=True): if azure_blob is None: pytest.skip("azure-storage-blob is required to test AzureBlobStore") blob_service_client = BlobServiceClient.from_connection_string(AZURITE_CONNECTION_STRING) container_client = blob_service_client.get_container_client(container_name) if container_client.exists(): container_client.delete_container() if create_container: container_client.create_container() try: yield finally: if container_client.exists(): container_client.delete_container() @pytest.fixture() def blobstore(): with azurite_container(): index = MemoryStore("index", key="task_id") store = AzureBlobStore( index, container_name=AZURITE_CONTAINER_NAME, azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, ) store.connect() yield store @pytest.fixture() def blobstore_two_docs(blobstore): blobstore.update( [ { "task_id": "mp-1", "data": "asd", blobstore.last_updated_field: datetime.utcnow(), } ] ) 
blobstore.update( [ { "task_id": "mp-3", "data": "sdf", blobstore.last_updated_field: datetime.utcnow(), } ] ) return blobstore @pytest.fixture() def blobstore_w_subdir(): with azurite_container(): index = MemoryStore("index") store = AzureBlobStore( index, container_name=AZURITE_CONTAINER_NAME, azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, sub_dir="subdir1", ) store.connect() yield store @pytest.fixture() def blobstore_multi(blobstore): blobstore.workers = 4 return blobstore def test_keys(): with azurite_container(): index = MemoryStore("index", key=1) with pytest.raises(AssertionError, match=r"Since we are.*"): store = AzureBlobStore(index, AZURITE_CONTAINER_NAME, workers=4, key=1) index = MemoryStore("index", key="key1") with pytest.warns(UserWarning, match=r"The desired .*key.*$"): store = AzureBlobStore( index, AZURITE_CONTAINER_NAME, workers=4, key="key2", azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, ) store.connect() store.update({"key1": "mp-1", "data": "1234"}) with pytest.raises(KeyError): store.update({"key2": "mp-2", "data": "1234"}) assert store.key == store.index.key == "key1" def test_multi_update(blobstore_two_docs, blobstore_multi): data = [ { "task_id": str(j), "data": "DATA", blobstore_multi.last_updated_field: datetime.utcnow(), } for j in range(32) ] def fake_writing(doc, search_keys): time.sleep(0.20) return {k: doc[k] for k in search_keys} blobstore_two_docs.write_doc_to_blob = fake_writing blobstore_multi.write_doc_to_blob = fake_writing start = time.time() blobstore_multi.update(data, key=["task_id"]) end = time.time() time_multi = end - start start = time.time() blobstore_two_docs.update(data, key=["task_id"]) end = time.time() time_single = end - start assert time_single > time_multi * (blobstore_multi.workers - 1) / (blobstore_two_docs.workers) def test_count(blobstore_two_docs): assert blobstore_two_docs.count() == 2 assert blobstore_two_docs.count({"task_id": "mp-3"}) == 1 def test_query(blobstore_two_docs): assert blobstore_two_docs.query_one(criteria={"task_id": "mp-2"}) is None assert blobstore_two_docs.query_one(criteria={"task_id": "mp-1"})["data"] == "asd" assert blobstore_two_docs.query_one(criteria={"task_id": "mp-3"})["data"] == "sdf" assert len(list(blobstore_two_docs.query())) == 2 def test_update(blobstore_two_docs): blobstore_two_docs.update( [ { "task_id": "mp-199999", "data": "asd", blobstore_two_docs.last_updated_field: datetime.utcnow(), } ] ) assert blobstore_two_docs.query_one({"task_id": "mp-199999"}) is not None blobstore_two_docs.compress = True blobstore_two_docs.update([{"task_id": "mp-4", "data": "asd"}]) obj = blobstore_two_docs.index.query_one({"task_id": "mp-4"}) assert obj["compression"] == "zlib" assert obj["obj_hash"] == "be74de5ac71f00ec9e96441a3c325b0592c07f4c" assert blobstore_two_docs.query_one({"task_id": "mp-4"})["data"] == "asd" def test_rebuild_meta_from_index(blobstore_two_docs): blobstore_two_docs.update([{"task_id": "mp-2", "data": "asd"}]) blobstore_two_docs.index.update({"task_id": "mp-2", "add_meta": "hello"}) blobstore_two_docs.rebuild_metadata_from_index() print(list(blobstore_two_docs.container.list_blobs())) blob_client = blobstore_two_docs.container.get_blob_client("mp-2") metadata = blob_client.get_blob_properties()["metadata"] assert metadata["add_meta"] == "hello" def test_rebuild_index(blobstore_two_docs): blobstore_two_docs.update([{"task_id": "mp-2", "data": "asd"}]) assert ( blobstore_two_docs.index.query_one({"task_id": "mp-2"})["obj_hash"] == 
"a69fe0c2cca3a3384c2b1d2f476972704f179741" ) blobstore_two_docs.index.remove_docs({}) assert blobstore_two_docs.index.query_one({"task_id": "mp-2"}) is None blobstore_two_docs.rebuild_index_from_blob_data() assert ( blobstore_two_docs.index.query_one({"task_id": "mp-2"})["obj_hash"] == "a69fe0c2cca3a3384c2b1d2f476972704f179741" ) def tests_msonable_read_write(blobstore_two_docs): dd = blobstore_two_docs.as_dict() blobstore_two_docs.update([{"task_id": "mp-2", "data": dd}]) res = blobstore_two_docs.query_one({"task_id": "mp-2"}) assert res["data"]["@module"] == "maggma.stores.azure" def test_remove(blobstore_two_docs): # At time of writing, Azurite does not support the delete_blobs operation # of the ContainerClient. See https://github.com/Azure/Azurite/issues/1809 pytest.skip("Azurite currently does not support delete_blobs") def objects_in_bucket(key): objs = list(blobstore_two_docs.container.list_blobs()) return key in [o["name"] for o in objs] blobstore_two_docs.update([{"task_id": "mp-2", "data": "asd"}]) blobstore_two_docs.update([{"task_id": "mp-4", "data": "asd"}]) blobstore_two_docs.update({"task_id": "mp-5", "data": "aaa"}) assert blobstore_two_docs.query_one({"task_id": "mp-2"}) is not None assert blobstore_two_docs.query_one({"task_id": "mp-4"}) is not None assert objects_in_bucket("mp-2") assert objects_in_bucket("mp-4") blobstore_two_docs.remove_docs({"task_id": "mp-2"}) blobstore_two_docs.remove_docs({"task_id": "mp-4"}, remove_blob_object=True) assert objects_in_bucket("mp-2") assert not objects_in_bucket("mp-4") assert blobstore_two_docs.query_one({"task_id": "mp-5"}) is not None def test_close(blobstore_two_docs): list(blobstore_two_docs.query()) blobstore_two_docs.close() with pytest.raises(RuntimeError): list(blobstore_two_docs.query()) def test_bad_import(mocker): mocker.patch("maggma.stores.azure.azure_blob", None) index = MemoryStore("index") with pytest.raises(RuntimeError): AzureBlobStore(index, "bucket1") def test_eq(mongostore, blobstore_two_docs): assert blobstore_two_docs == blobstore_two_docs assert blobstore_two_docs != mongostore def test_count_subdir(blobstore_w_subdir): blobstore_w_subdir.update([{"task_id": "mp-1", "data": "asd"}]) blobstore_w_subdir.update([{"task_id": "mp-2", "data": "asd"}]) assert blobstore_w_subdir.count() == 2 assert blobstore_w_subdir.count({"task_id": "mp-2"}) == 1 def test_subdir_field(blobstore_w_subdir): blobstore_w_subdir.update([{"task_id": "mp-1", "data": "asd"}]) blobstore_w_subdir.update([{"task_id": "mp-2", "data": "asd"}]) for cc in blobstore_w_subdir.index.query(): assert len(cc["sub_dir"]) > 0 assert cc["sub_dir"] == blobstore_w_subdir.sub_dir def test_remove_subdir(blobstore_w_subdir): blobstore_w_subdir.update([{"task_id": "mp-2", "data": "asd"}]) blobstore_w_subdir.update([{"task_id": "mp-4", "data": "asd"}]) assert blobstore_w_subdir.query_one({"task_id": "mp-2"}) is not None assert blobstore_w_subdir.query_one({"task_id": "mp-4"}) is not None blobstore_w_subdir.remove_docs({"task_id": "mp-2"}) assert blobstore_w_subdir.query_one({"task_id": "mp-2"}) is None assert blobstore_w_subdir.query_one({"task_id": "mp-4"}) is not None def test_searchable_fields(blobstore_two_docs): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, blobstore_two_docs.last_updated_field: tic} for i in range(4)] blobstore_two_docs.searchable_fields = ["task_id"] blobstore_two_docs.update(data, key="a") # This should only work if the searchable field was put into the index store assert 
set(blobstore_two_docs.distinct("task_id")) == { "mp-0", "mp-1", "mp-2", "mp-3", } def test_newer_in(blobstore): tic = datetime(2018, 4, 12, 16) tic2 = datetime.utcnow() name_old = AZURITE_CONTAINER_NAME name_new = AZURITE_CONTAINER_NAME + "-2" with azurite_container(name_old), azurite_container(name_new): index_old = MemoryStore("index_old") old_store = AzureBlobStore( index_old, name_old, azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, ) old_store.connect() old_store.update([{"task_id": "mp-1", "last_updated": tic}]) old_store.update([{"task_id": "mp-2", "last_updated": tic}]) index_new = MemoryStore("index_new") new_store = AzureBlobStore( index_new, name_new, azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, ) new_store.connect() new_store.update([{"task_id": "mp-1", "last_updated": tic2}]) new_store.update([{"task_id": "mp-2", "last_updated": tic2}]) assert len(old_store.newer_in(new_store)) == 2 assert len(new_store.newer_in(old_store)) == 0 assert len(old_store.newer_in(new_store.index)) == 2 assert len(new_store.newer_in(old_store.index)) == 0 def test_additional_metadata(blobstore_two_docs): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, blobstore_two_docs.last_updated_field: tic} for i in range(4)] blobstore_two_docs.update(data, key="a", additional_metadata="task_id") # This should only work if the searchable field was put into the index store assert set(blobstore_two_docs.distinct("task_id")) == { "mp-0", "mp-1", "mp-2", "mp-3", } def test_no_container(): with azurite_container(create_container=False): index = MemoryStore("index") store = AzureBlobStore( index, AZURITE_CONTAINER_NAME, azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, ) with pytest.raises(RuntimeError, match=r".*Container not present.*"): store.connect() # check that the store can create store = AzureBlobStore( index, AZURITE_CONTAINER_NAME, azure_client_info={"connection_string": AZURITE_CONNECTION_STRING}, create_container=True, ) store.connect() def test_name(blobstore): assert blobstore.name == f"container://{AZURITE_CONTAINER_NAME}" def test_ensure_index(blobstore_two_docs): assert blobstore_two_docs.ensure_index("task-id") def test_no_login(): with azurite_container(): index = MemoryStore("index") store = AzureBlobStore( index, AZURITE_CONTAINER_NAME, azure_client_info={}, ) with pytest.raises(RuntimeError, match=r".*Could not instantiate BlobServiceClient.*"): store.connect() maggma-0.70.0/tests/stores/test_compound_stores.py000066400000000000000000000151331470132070100223420ustar00rootroot00000000000000from datetime import datetime from itertools import chain import pytest from pydash import get from maggma.stores import ConcatStore, JointStore, MemoryStore, MongoStore @pytest.fixture() def mongostore(): store = MongoStore("magmma_test", "test") store.connect() yield store store._collection.drop() @pytest.fixture(scope="module") def jointstore_test1(): store = MongoStore("maggma_test", "test1") store.connect() yield store store._collection.drop() @pytest.fixture(scope="module") def jointstore_test2(): store = MongoStore("maggma_test", "test2") store.connect() yield store store._collection.drop() @pytest.fixture(scope="module") def jointstore(jointstore_test1, jointstore_test2): jointstore_test1.update( [ { "task_id": k, "my_prop": k + 1, "last_updated": datetime.utcnow(), "category": k // 5, } for k in range(10) ] ) jointstore_test2.update( [ { "task_id": 2 * k, "your_prop": k + 3, "last_updated": datetime.utcnow(), 
"category2": k // 3, } for k in range(5) ] ) store = JointStore("maggma_test", ["test1", "test2"]) store.connect() return store def test_joint_store_count(jointstore): assert jointstore.count() == 10 assert jointstore.count({"test2.category2": {"$exists": 1}}) == 5 def test_joint_store_query(jointstore): # Test query all docs = list(jointstore.query()) assert len(docs) == 10 docs_w_field = [d for d in docs if "test2" in d] assert len(docs_w_field) == 5 docs_w_field = sorted(docs_w_field, key=lambda x: x["task_id"]) assert docs_w_field[0]["test2"]["your_prop"] == 3 assert docs_w_field[0]["task_id"] == 0 assert docs_w_field[0]["my_prop"] == 1 def test_joint_store_query_one(jointstore): doc = jointstore.query_one() assert doc["my_prop"] == doc["task_id"] + 1 # Test limit properties doc = jointstore.query_one(properties=["test2", "task_id"]) assert doc["test2"]["your_prop"] == doc["task_id"] + 3 assert doc.get("my_prop") is None # Test criteria doc = jointstore.query_one(criteria={"task_id": {"$gte": 10}}) assert doc is None doc = jointstore.query_one(criteria={"test2.your_prop": {"$gt": 6}}) assert doc["task_id"] == 8 # Test merge_at_root jointstore.merge_at_root = True # Test merging is working properly doc = jointstore.query_one(criteria={"task_id": 2}) assert doc["my_prop"] == 3 assert doc["your_prop"] == 4 # Test merging is allowing for subsequent match doc = jointstore.query_one(criteria={"your_prop": {"$gt": 6}}) assert doc["task_id"] == 8 @pytest.mark.xfail(reason="key grouping appears to make lists") def test_joint_store_distinct(jointstore): your_prop = jointstore.distinct("test2.your_prop") assert set(your_prop) == {k + 3 for k in range(5)} my_prop = jointstore.distinct("my_prop") assert set(my_prop) == {k + 1 for k in range(10)} my_prop_cond = jointstore.distinct("my_prop", {"test2.your_prop": {"$gte": 5}}) assert set(my_prop_cond), {5, 7 == 9} def test_joint_store_last_updated(jointstore, jointstore_test1, jointstore_test2): test1 = jointstore_test1 test2 = jointstore_test2 doc = jointstore.query_one({"task_id": 0}) test1doc = test1.query_one({"task_id": 0}) test2doc = test2.query_one({"task_id": 0}) assert test1doc["last_updated"] == doc["last_updated"] assert test2doc["last_updated"] != doc["last_updated"] # Swap the two test2date = test2doc["last_updated"] test2doc["last_updated"] = test1doc["last_updated"] test1doc["last_updated"] = test2date test1.update([test1doc]) test2.update([test2doc]) doc = jointstore.query_one({"task_id": 0}) test1doc = test1.query_one({"task_id": 0}) test2doc = test2.query_one({"task_id": 0}) assert test1doc["last_updated"] == doc["last_updated"] assert test2doc["last_updated"] != doc["last_updated"] # Check also that still has a field if no task2 doc doc = jointstore.query_one({"task_id": 1}) assert doc["last_updated"] is not None def test_joint_store_groupby(jointstore): docs = list(jointstore.groupby("category")) assert len(docs[0][1]) == 5 assert len(docs[1][1]) == 5 docs = list(jointstore.groupby("test2.category2")) none_docs = next(d for d in docs if get(d[0], "test2.category2") == []) one_docs = next(d for d in docs if get(d[0], "test2.category2") == [1]) zero_docs = next(d for d in docs if get(d[0], "test2.category2") == [0]) assert len(none_docs[1]) == 5 assert len(one_docs[1]) == 2 assert len(zero_docs[1]) == 3 def test_joint_update(jointstore): with pytest.raises(NotImplementedError): jointstore.update({}) def test_joint_remove_docs(jointstore): with pytest.raises(NotImplementedError): jointstore.remove_docs({}) @pytest.fixture() def 
concat_store(): mem_stores = [MemoryStore(str(i)) for i in range(4)] store = ConcatStore(mem_stores) store.connect() index = 0 props = {i: str(i) for i in range(10)} for mem_store in mem_stores: docs = [{"task_id": i, "prop": props[i - index], "index": index} for i in range(index, index + 10)] index = index + 10 mem_store.update(docs) return store def test_concat_store_distinct(concat_store): docs = list(concat_store.distinct("task_id")) actual_docs = list(chain.from_iterable([store.distinct("task_id") for store in concat_store.stores])) assert len(docs) == len(actual_docs) assert set(docs) == set(actual_docs) def test_concat_store_groupby(concat_store): assert len(list(concat_store.groupby("index"))) == 4 assert len(list(concat_store.groupby("task_id"))) == 40 def test_concat_store_count(concat_store): assert concat_store.count() == 40 assert concat_store.count({"prop": "3"}) == 4 def test_concat_store_query(concat_store): docs = list(concat_store.query(properties=["task_id"])) t_ids = [d["task_id"] for d in docs] assert len(t_ids) == len(set(t_ids)) assert len(t_ids) == 40 def test_eq(mongostore, jointstore, concat_store): assert jointstore == jointstore assert concat_store == concat_store assert mongostore != jointstore assert mongostore != concat_store assert jointstore != concat_store def test_serialize(concat_store): d = concat_store.as_dict() new_concat_store = ConcatStore.from_dict(d) assert len(new_concat_store.stores) == len(concat_store.stores) maggma-0.70.0/tests/stores/test_file_store.py000066400000000000000000000331461470132070100212560ustar00rootroot00000000000000""" Tests for FileStore Desired behavior ---------------- - A FileStore is initialized on a directory containing files - The FileStore reads the files and populates itself with file metadata - If there is a FileStore.json present, its contents are read and merged with the file metadata - If there are records (file_id) in the JSON metadata that are not associated with a file on disk anymore, they are marked as orphans with 'orphan: True' and added to the store. - If there is no FileStore.json present - if read_only=False, the file is created - if read_only=True, no metadata is read in - if read_only=False, the update() method is enabled - if a FileStore is moved to a different location on disk (but all contents of the main directory are preserved), file_ids should not change and metadata should remain intact. """ import hashlib from datetime import datetime, timezone from distutils.dir_util import copy_tree from pathlib import Path import pytest from maggma.core import StoreError from maggma.stores.file_store import FileStore @pytest.fixture() def test_dir(tmp_path): module_dir = Path(__file__).resolve().parent test_dir = module_dir / ".." 
/ "test_files" / "file_store_test" copy_tree(str(test_dir), str(tmp_path)) return tmp_path.resolve() def test_record_from_file(test_dir): """ Test functionality of _create_record_from_file """ fs = FileStore(test_dir, read_only=True) fs.connect() f = Path(test_dir / "calculation1" / "input.in") relative_path = f.relative_to(test_dir) digest = hashlib.md5() digest.update(str(relative_path).encode()) file_id = str(digest.hexdigest()) d = fs._create_record_from_file(f) assert d["name"] == "input.in" assert d["parent"] == "calculation1" assert d["path"] == test_dir / "calculation1" / "input.in" assert d["size"] == pytest.approx(90, abs=1) assert isinstance(d["hash"], str) assert d["file_id"] == file_id assert d["last_updated"] == datetime.fromtimestamp(f.stat().st_mtime, tz=timezone.utc) def test_newer_in_on_local_update(test_dir): """ Init a FileStore modify one of the files on disk Init another FileStore on the same directory confirm that one record shows up in newer_in """ fs = FileStore(test_dir, read_only=False) fs.connect() with open(test_dir / "calculation1" / "input.in", "w") as f: f.write("Ryan was here") fs2 = FileStore(test_dir, read_only=False) fs2.connect() assert fs2.last_updated > fs.last_updated assert ( fs2.query_one({"path": {"$regex": "calculation1/input.in"}})["last_updated"] > fs.query_one({"path": {"$regex": "calculation1/input.in"}})["last_updated"] ) assert len(fs.newer_in(fs2)) == 1 def test_max_depth(test_dir): """ test max_depth parameter NOTE this test only creates a single temporary directory, meaning that the JSON file created by the first FileStore.init() persists for the other tests. This creates the possibility of orphaned metadata. """ # default (None) should parse all 6 files fs = FileStore(test_dir, read_only=False) fs.connect() assert len(list(fs.query())) == 6 # 0 depth should parse 1 file fs = FileStore(test_dir, read_only=False, max_depth=0) fs.connect() assert len(list(fs.query())) == 1 # 1 depth should parse 5 files fs = FileStore(test_dir, read_only=False, max_depth=1) fs.connect() assert len(list(fs.query())) == 5 # 2 depth should parse 6 files fs = FileStore(test_dir, read_only=False, max_depth=2) fs.connect() assert len(list(fs.query())) == 6 def test_orphaned_metadata(test_dir): """ test behavior when orphaned metadata is found NOTE the design of this test exploits the fact that the test only creates a single temporary directory, meaning that the JSON file created by the first FileStore.init() persists for the other tests. 
""" # make a FileStore of all files and add metadata to all of them fs = FileStore(test_dir, read_only=False) fs.connect() data = list(fs.query()) for d in data: d.update({"tags": "Ryan was here"}) fs.update(data) assert len(list(fs.query())) == 6 assert len(list(fs.query({"tags": {"$exists": True}}))) == 6 # the orphan field should be populated for all documents assert len(list(fs.query({"orphan": {"$exists": True}}))) == 6 fs.close() # re-init the store with a different max_depth parameter # this will result in orphaned metadata # with include_orphans=True, this should be returned in queries fs = FileStore(test_dir, read_only=True, max_depth=1, include_orphans=True) with pytest.warns(UserWarning, match="Orphaned metadata was found in FileStore.json"): fs.connect() assert len(list(fs.query())) == 6 assert len(list(fs.query({"tags": {"$exists": True}}))) == 6 # all items, including orphans, should have a file_id and path_relative assert len(list(fs.query({"file_id": {"$exists": True}}))) == 6 assert len(list(fs.query({"path_relative": {"$exists": True}}))) == 6 assert len(list(fs.query({"orphan": True}))) == 1 fs.close() # re-init the store after renaming one of the files on disk # this will result in orphaned metadata # with include_orphans=False (default), that metadata should be # excluded from query results Path(test_dir / "calculation1" / "input.in").rename(test_dir / "calculation1" / "input_renamed.in") fs = FileStore(test_dir, read_only=True, include_orphans=False) with pytest.warns(UserWarning, match="Orphaned metadata was found in FileStore.json"): fs.connect() assert len(list(fs.query())) == 6 assert len(list(fs.query({"tags": {"$exists": True}}))) == 5 assert len(list(fs.query({"path": {"$exists": True}}))) == 6 # manually specifying orphan: True should still work assert len(list(fs.query({"orphan": True}))) == 1 fs.close() def test_store_files_moved(test_dir): """ test behavior when the directory that constitutes the FileStore is moved to a new location on disk """ # make a FileStore of all files and add metadata to all of them fs = FileStore(test_dir, read_only=False) fs.connect() data = list(fs.query()) for d in data: d.update({"tags": "Ryan was here"}) fs.update(data) # the orphan field should be populated for all documents, and False assert len(list(fs.query({"orphan": False}))) == 6 original_file_ids = {f["file_id"] for f in fs.query()} original_paths = {f["path"] for f in fs.query()} fs.close() # now copy the entire FileStore to a new directory and re-initialize copy_tree(test_dir, str(test_dir / "new_store_location")) fs = FileStore(test_dir / "new_store_location", read_only=False) fs.connect() assert len(list(fs.query({"orphan": False}))) == 6 assert {f["file_id"] for f in fs.query()} == original_file_ids # absolute paths should change to follow the FileStore assert {f["path"] for f in fs.query()} != original_paths for d in fs.query(properties=["path"]): assert str(d["path"]).startswith(str(fs.path)) def test_file_filters(test_dir): """ Make sure multiple patterns work correctly """ # here, we should get 2 input.in files and the file_2_levels_deep.json # the store's FileStore.json should be skipped even though .json is # in the file patterns fs = FileStore(test_dir, read_only=False, file_filters=["*.in", "*.json"]) fs.connect() assert len(list(fs.query())) == 3 def test_read_only(test_dir): """ Make sure nothing is written to a read-only FileStore and that documents cannot be deleted """ with pytest.warns(UserWarning, match="JSON file 'random.json' not found"): fs = 
FileStore(test_dir, read_only=True, json_name="random.json") fs.connect() assert not Path(test_dir / "random.json").exists() file_id = fs.query_one()["file_id"] with pytest.raises(StoreError, match="read-only"): fs.update({"file_id": file_id, "tags": "something"}) with pytest.raises(StoreError, match="read-only"): fs.remove_docs({}) def test_query(test_dir): """ File contents should be read unless file is too large size and path keys should not be returned unless explicitly requested querying on 'contents' should raise a warning contents should be empty if a file is too large empty properties kwarg should return contents, size, and path (along with everything else) """ fs = FileStore(test_dir, read_only=True) fs.connect() d = fs.query_one( {"name": "input.in", "parent": "calculation1"}, properties=["file_id", "contents"], ) assert not d.get("size") assert not d.get("path") assert d.get("file_id") assert d.get("contents") assert "This is the file named input.in" in d["contents"] d = fs.query_one( {"name": "input.in", "parent": "calculation1"}, properties=None, ) assert d.get("size") assert d.get("path") assert d.get("file_id") assert d.get("contents") with pytest.warns(UserWarning, match="'contents' is not a queryable field!"): fs.query_one({"contents": {"$regex": "input.in"}}) d = fs.query_one( {"name": "input.in", "parent": "calculation1"}, properties=["name", "contents"], contents_size_limit=50, ) assert d["contents"] == "File exceeds size limit of 50 bytes" assert d.get("name") def test_remove(test_dir): """ Test behavior of remove_docs() """ fs = FileStore(test_dir, read_only=False) fs.connect() paths = [d["path"] for d in fs.query()] with pytest.raises(StoreError, match="about to delete 6 items"): fs.remove_docs({}) fs.remove_docs({"name": "input.in"}, confirm=True) assert len(list(fs.query())) == 4 assert not Path.exists(test_dir / "calculation1" / "input.in") assert not Path.exists(test_dir / "calculation2" / "input.in") fs.remove_docs({}, confirm=True) assert not any(Path(p).exists() for p in paths) def test_metadata(test_dir): """ 1. init a FileStore 2. add some metadata to both 'input.in' files 3. confirm metadata written to .json 4. close the store, init a new one 5. 
confirm metadata correctly associated with the files """ fs = FileStore(test_dir, read_only=False, last_updated_field="last_change") fs.connect() query = {"name": "input.in", "parent": "calculation1"} key = next(iter(fs.query(query)))[fs.key] fs.add_metadata( { "metadata": {"experiment date": "2022-01-18"}, fs.last_updated_field: "this should not be here", }, query, ) # make sure metadata has been added to the item without removing other contents item_from_store = next(iter(fs.query({"file_id": key}))) assert item_from_store.get("name", False) assert item_from_store.get("metadata", False) fs.close() # only the updated item should have been written to the JSON, # and it should not contain any of the protected keys data = fs.metadata_store.read_json_file(fs.path / fs.json_name) assert len(data) == 1 item_from_file = next(d for d in data if d["file_id"] == key) assert item_from_file["metadata"] == {"experiment date": "2022-01-18"} assert not item_from_file.get("name") assert not item_from_file.get("path") assert not item_from_file.get(fs.last_updated_field) assert item_from_file.get("path_relative") # make sure metadata is preserved after reconnecting fs2 = FileStore(test_dir, read_only=True) fs2.connect() data = fs2.metadata_store.read_json_file(fs2.path / fs2.json_name) item_from_file = next(d for d in data if d["file_id"] == key) assert item_from_file["metadata"] == {"experiment date": "2022-01-18"} # make sure reconnected store properly merges in the metadata item_from_store = next(iter(fs2.query({"file_id": key}))) assert item_from_store["name"] == "input.in" assert item_from_store["parent"] == "calculation1" assert item_from_store.get("metadata") == {"experiment date": "2022-01-18"} fs2.close() # make sure reconnecting with read_only=False doesn't remove metadata from the JSON fs3 = FileStore(test_dir, read_only=False) fs3.connect() data = fs3.metadata_store.read_json_file(fs3.path / fs3.json_name) item_from_file = next(d for d in data if d["file_id"] == key) assert item_from_file["metadata"] == {"experiment date": "2022-01-18"} item_from_store = next(iter(fs3.query({"file_id": key}))) assert item_from_store["name"] == "input.in" assert item_from_store["parent"] == "calculation1" assert item_from_store.get("metadata") == {"experiment date": "2022-01-18"} fs3.close() # test automatic metadata assignment def add_data_from_name(d): return {"calc_name": d["name"][0:5]} fs4 = FileStore(test_dir, read_only=False) fs4.connect() # apply the auto function to all records fs4.add_metadata(auto_data=add_data_from_name) for d in fs4.query(): print(d) assert d.get("calc_name", False) == d["name"][0:5] def test_json_name(test_dir): """ Make sure custom .json name works """ fs = FileStore(test_dir, read_only=False, json_name="random.json") fs.connect() assert Path(test_dir / "random.json").exists() def test_this_dir(): """ Make sure connect() works when path is "." 
""" fs = FileStore(".") fs.connect() assert not fs.name.endswith(".") def test_encoding(): """ Make sure custom encoding works """ fs = FileStore(".", read_only=False, encoding="utf8") fs.connect() assert Path("FileStore.json").exists() maggma-0.70.0/tests/stores/test_gridfs.py000066400000000000000000000213461470132070100204000ustar00rootroot00000000000000import json import os from datetime import datetime import numpy as np import numpy.testing as nptu import pytest from pymongo.errors import ConfigurationError from maggma.core import StoreError from maggma.stores import GridFSStore, MongoStore from maggma.stores.gridfs import GridFSURIStore, files_collection_fields @pytest.fixture() def mongostore(): store = MongoStore("maggma_test", "test") store.connect() yield store store._collection.drop() @pytest.fixture() def gridfsstore(): store = GridFSStore("maggma_test", "test", key="task_id") store.connect() yield store store._files_collection.drop() store._chunks_collection.drop() def test_update(gridfsstore): data1 = np.random.rand(256) data2 = np.random.rand(256) tic = datetime(2018, 4, 12, 16) # Test metadata storage gridfsstore.update([{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]) assert gridfsstore._files_collection.find_one({"metadata.task_id": "mp-1"}) is not None # Test storing data gridfsstore.update([{"task_id": "mp-1", "data": data2, gridfsstore.last_updated_field: tic}]) assert len(list(gridfsstore.query({"task_id": "mp-1"}))) == 1 assert "task_id" in gridfsstore.query_one({"task_id": "mp-1"}) nptu.assert_almost_equal(gridfsstore.query_one({"task_id": "mp-1"})["data"], data2, 7) # Test storing compressed data gridfsstore = GridFSStore("maggma_test", "test", key="task_id", compression=True) gridfsstore.connect() gridfsstore.update([{"task_id": "mp-1", "data": data1}]) assert gridfsstore._files_collection.find_one({"metadata.compression": "zlib"}) is not None nptu.assert_almost_equal(gridfsstore.query_one({"task_id": "mp-1"})["data"], data1, 7) def test_remove(gridfsstore): data1 = np.random.rand(256) data2 = np.random.rand(256) tic = datetime(2018, 4, 12, 16) gridfsstore.update([{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]) gridfsstore.update([{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]) assert gridfsstore.query_one(criteria={"task_id": "mp-1"}) assert gridfsstore.query_one(criteria={"task_id": "mp-2"}) gridfsstore.remove_docs({"task_id": "mp-1"}) assert gridfsstore.query_one(criteria={"task_id": "mp-1"}) is None assert gridfsstore.query_one(criteria={"task_id": "mp-2"}) def test_count(gridfsstore): data1 = np.random.rand(256) data2 = np.random.rand(256) tic = datetime(2018, 4, 12, 16) gridfsstore.update([{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]) assert gridfsstore.count() == 1 gridfsstore.update([{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]) assert gridfsstore.count() == 2 assert gridfsstore.count({"task_id": "mp-2"}) == 1 def test_query(gridfsstore): data1 = np.random.rand(256) data2 = np.random.rand(256) tic = datetime(2018, 4, 12, 16) gridfsstore.update([{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]) gridfsstore.update([{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]) doc = gridfsstore.query_one(criteria={"task_id": "mp-1"}) nptu.assert_almost_equal(doc["data"], data1, 7) doc = gridfsstore.query_one(criteria={"task_id": "mp-2"}) nptu.assert_almost_equal(doc["data"], data2, 7) assert 
gridfsstore.last_updated_field in doc assert gridfsstore.query_one(criteria={"task_id": "mp-3"}) is None def test_query_gridfs_file(gridfsstore): # put the data directly in gridfs, mimicking an existing gridfs collection # generated without the store gridfsstore._collection.put(b"hello world", task_id="mp-1") doc = gridfsstore.query_one() assert doc["data"].decode() == "hello world" assert doc[gridfsstore.key] == "mp-1" def test_last_updated(gridfsstore): data1 = np.random.rand(256) data2 = np.random.rand(256) tic = datetime(2018, 4, 12, 16) gridfsstore.update([{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]) gridfsstore.update([{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]) assert gridfsstore.last_updated == tic toc = datetime(2019, 6, 12, 16) gridfsstore.update([{"task_id": "mp-3", "data": data2, gridfsstore.last_updated_field: toc}]) assert gridfsstore.last_updated == toc tic = datetime(2017, 6, 12, 16) gridfsstore.update([{"task_id": "mp-4", "data": data2, gridfsstore.last_updated_field: tic}]) assert gridfsstore.last_updated == toc def test_groupby(gridfsstore): tic = datetime(2018, 4, 12, 16) for i in range(3): gridfsstore.update( [{"task_id": f"mp-{i}", "a": 1, gridfsstore.last_updated_field: tic}], key=["task_id", "a"], ) for i in range(3, 7): gridfsstore.update( [{"task_id": f"mp-{i}", "a": 2, gridfsstore.last_updated_field: tic}], key=["task_id", "a"], ) groups = list(gridfsstore.groupby("a")) assert len(groups) == 2 assert {g[0]["a"] for g in groups} == {1, 2} by_group = {} for group, docs in groups: by_group[group["a"]] = {d["task_id"] for d in docs} assert by_group[1] == {"mp-0", "mp-1", "mp-2"} assert by_group[2] == {"mp-3", "mp-4", "mp-5", "mp-6"} def test_distinct(gridfsstore): tic = datetime(2018, 4, 12, 16) for i in range(3): gridfsstore.update( [{"task_id": f"mp-{i}", "a": 1, gridfsstore.last_updated_field: tic}], key=["task_id", "a"], ) for i in range(3, 7): gridfsstore.update( [{"task_id": f"mp-{i}", "a": 2, gridfsstore.last_updated_field: tic}], key=["task_id", "a"], ) assert set(gridfsstore.distinct("a")) == {1, 2} def test_eq(mongostore, gridfsstore): assert gridfsstore == gridfsstore assert mongostore != gridfsstore def test_index(gridfsstore): assert gridfsstore.ensure_index("test_key") for field in files_collection_fields: assert gridfsstore.ensure_index(field) def test_gfs_metadata(gridfsstore): """ Ensure metadata is put back in the document """ tic = datetime(2018, 4, 12, 16) gridfsstore.ensure_metadata = True for i in range(3): data = { "a": 1, } metadata = {"task_id": f"mp-{i}", "a": 1, gridfsstore.last_updated_field: tic} data = json.dumps(data).encode("UTF-8") gridfsstore._collection.put(data, metadata=metadata) for d in gridfsstore.query(): assert "task_id" in d assert gridfsstore.last_updated_field in d def test_gridfsstore_from_launchpad_file(lp_file): ms = GridFSStore.from_launchpad_file(lp_file, collection_name="tmp") ms.connect() assert ms.name == "gridfs://localhost/maggma_tests/tmp" def test_searchable_fields(gridfsstore): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, gridfsstore.last_updated_field: tic} for i in range(3)] gridfsstore.searchable_fields = ["task_id"] gridfsstore.update(data, key="a") # This should only work if the searchable field was put into the index store assert set(gridfsstore.distinct("task_id")) == {"mp-0", "mp-1", "mp-2"} def test_additional_metadata(gridfsstore): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, 
gridfsstore.last_updated_field: tic} for i in range(3)] gridfsstore.update(data, key="a", additional_metadata="task_id") # This should only work if the searchable field was put into the index store assert set(gridfsstore.distinct("task_id")) == {"mp-0", "mp-1", "mp-2"} @pytest.mark.skipif( "mongodb+srv" not in os.environ.get("MONGODB_SRV_URI", ""), reason="requires special mongodb+srv URI", ) def test_gridfs_uri(): uri = os.environ["MONGODB_SRV_URI"] store = GridFSURIStore(uri, database="mp_core", collection_name="xas") store.connect() is_name = store.name is uri # This is try and keep the secret safe assert is_name def test_gridfs_uri_dbname_parse(): # test parsing dbname from uri uri_with_db = "mongodb://uuu:xxxx@host:27017/fake_db" store = GridFSURIStore(uri_with_db, "test") assert store.database == "fake_db" uri_with_db = "mongodb://uuu:xxxx@host:27017/fake_db" store = GridFSURIStore(uri_with_db, "test", database="fake_db2") assert store.database == "fake_db2" uri_with_db = "mongodb://uuu:xxxx@host:27017" with pytest.raises(ConfigurationError): GridFSURIStore(uri_with_db, "test") def test_close(gridfsstore): assert gridfsstore.query_one() is None gridfsstore.close() with pytest.raises(StoreError): gridfsstore.query_one() # reconnect to allow the drop of the collection in the fixture gridfsstore.connect() maggma-0.70.0/tests/stores/test_mongolike.py000066400000000000000000000503501470132070100211030ustar00rootroot00000000000000import os import shutil from datetime import datetime from pathlib import Path from unittest import mock import mongomock.collection import orjson import pymongo.collection import pytest from bson.objectid import ObjectId from monty.tempfile import ScratchDir from pymongo.errors import ConfigurationError, DocumentTooLarge, OperationFailure from maggma.core import StoreError from maggma.stores import JSONStore, MemoryStore, MongoStore, MongoURIStore, MontyStore from maggma.validators import JSONSchemaValidator @pytest.fixture() def mongostore(): store = MongoStore( database="maggma_test", collection_name="test", ) store.connect() yield store store._collection.drop() @pytest.fixture() def montystore(tmp_dir): store = MontyStore("maggma_test") store.connect() return store @pytest.fixture() def memorystore(): store = MemoryStore() store.connect() return store @pytest.fixture() def jsonstore(test_dir): files = [] for f in ["a.json", "b.json"]: files.append(test_dir / "test_set" / f) return JSONStore(files) @pytest.mark.xfail(raises=StoreError) def test_mongostore_connect_error(): mongostore = MongoStore("maggma_test", "test") mongostore.count() def test_mongostore_connect_reconnect(): mongostore = MongoStore("maggma_test", "test") assert mongostore._coll is None mongostore.connect() assert isinstance(mongostore._collection, pymongo.collection.Collection) mongostore.close() assert mongostore._coll is None mongostore.connect() def test_mongostore_query(mongostore): mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert mongostore.query_one(properties=["a"])["a"] == 1 assert mongostore.query_one(properties=["a"])["a"] == 1 assert mongostore.query_one(properties=["b"])["b"] == 2 assert mongostore.query_one(properties=["c"])["c"] == 3 def test_mongostore_count(mongostore): mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert mongostore.count() == 1 mongostore._collection.insert_one({"aa": 1, "b": 2, "c": 3}) assert mongostore.count() == 2 assert mongostore.count({"a": 1}) == 1 def test_mongostore_distinct(mongostore): 
mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3}) mongostore._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}}) assert set(mongostore.distinct("a")) == {1, 4} # Test list distinct functionality mongostore._collection.insert_one({"a": 4, "d": 6, "e": 7}) mongostore._collection.insert_one({"a": 4, "d": 6, "g": {"h": 2}}) # Test distinct subdocument functionality ghs = mongostore.distinct("g.h") assert set(ghs) == {1, 2} # Test when key doesn't exist assert mongostore.distinct("blue") == [] # Test when null is a value mongostore._collection.insert_one({"i": None}) assert mongostore.distinct("i") == [None] # Test to make sure DocumentTooLarge errors get dealt with properly using built in distinct mongostore._collection.insert_many([{"key": [f"mp-{i}"]} for i in range(1000000)]) vals = mongostore.distinct("key") # Test to make sure distinct on array field is unraveled when using manual distinct assert len(vals) == len(list(range(1000000))) assert all(isinstance(v, str) for v in vals) # Test to make sure manual distinct uses the criteria query mongostore._collection.insert_many([{"key": f"mp-{i}", "a": 2} for i in range(1000001, 2000001)]) vals = mongostore.distinct("key", {"a": 2}) assert len(vals) == len(list(range(1000001, 2000001))) def test_mongostore_update(mongostore): mongostore.update({"e": 6, "d": 4}, key="e") assert mongostore.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])["d"] == 4 mongostore.update([{"e": 7, "d": 8, "f": 9}], key=["d", "f"]) assert mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 7 mongostore.update([{"e": 11, "d": 8, "f": 9}], key=["d", "f"]) assert mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 11 test_schema = { "type": "object", "properties": {"e": {"type": "integer"}}, "required": ["e"], } mongostore.validator = JSONSchemaValidator(schema=test_schema) mongostore.update({"e": 100, "d": 3}, key="e") # Continue to update doc when validator is not set to strict mode mongostore.update({"e": "abc", "d": 3}, key="e") # ensure safe_update works to not throw DocumentTooLarge errors large_doc = {f"mp-{i}": f"mp-{i}" for i in range(1000000)} large_doc["e"] = 999 with pytest.raises((OperationFailure, DocumentTooLarge)): mongostore.update([large_doc, {"e": 1001}], key="e") mongostore.safe_update = True mongostore.update([large_doc, {"e": 1001}], key="e") assert mongostore.query_one({"e": 1001}) is not None def test_mongostore_groupby(mongostore): mongostore.update( [ {"e": 7, "d": 9, "f": 9}, {"e": 7, "d": 9, "f": 10}, {"e": 8, "d": 9, "f": 11}, {"e": 9, "d": 10, "f": 12}, ], key="f", ) data = list(mongostore.groupby("d")) assert len(data) == 2 grouped_by_9 = next(g[1] for g in data if g[0]["d"] == 9) assert len(grouped_by_9) == 3 grouped_by_10 = next(g[1] for g in data if g[0]["d"] == 10) assert len(grouped_by_10) == 1 data = list(mongostore.groupby(["e", "d"])) assert len(data) == 3 def test_mongostore_remove_docs(mongostore): mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3}) mongostore._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}}) mongostore.remove_docs({"a": 1}) assert len(list(mongostore.query({"a": 4}))) == 1 assert len(list(mongostore.query({"a": 1}))) == 0 def test_mongostore_from_db_file(mongostore, db_json): ms = MongoStore.from_db_file(db_json) ms.connect() assert ms._collection.full_name == "maggma_tests.tmp" def test_mongostore_from_launchpad_file(lp_file): ms = MongoStore.from_launchpad_file(lp_file, collection_name="tmp") ms.connect() assert 
ms._collection.full_name == "maggma_tests.tmp" def test_mongostore_from_collection(mongostore, db_json): ms = MongoStore.from_db_file(db_json) ms.connect() other_ms = MongoStore.from_collection(ms._collection) assert ms._coll.full_name == other_ms._collection.full_name assert ms.database == other_ms.database def test_mongostore_name(mongostore): assert mongostore.name == "mongo://localhost/maggma_test/test" def test_ensure_index(mongostore): assert mongostore.ensure_index("test_key") # TODO: How to check for exception? def test_mongostore_last_updated(mongostore): assert mongostore.last_updated == datetime.min start_time = datetime.utcnow() mongostore._collection.insert_one({mongostore.key: 1, "a": 1}) with pytest.raises(StoreError) as cm: # noqa: PT012 mongostore.last_updated # noqa: B018 assert cm.match(mongostore.last_updated_field) mongostore.update([{mongostore.key: 1, "a": 1, mongostore.last_updated_field: datetime.utcnow()}]) assert mongostore.last_updated > start_time def test_mongostore_newer_in(mongostore): target = MongoStore("maggma_test", "test_target") target.connect() # make sure docs are newer in mongostore then target and check updated_keys target.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) # Update docs in source mongostore.update([{mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()} for i in range(10)]) assert len(target.newer_in(mongostore)) == 10 assert len(target.newer_in(mongostore, exhaustive=True)) == 10 assert len(mongostore.newer_in(target)) == 0 target._collection.drop() # Memory store tests def test_memory_store_connect(): memorystore = MemoryStore() assert memorystore._coll is None memorystore.connect() assert isinstance(memorystore._collection, mongomock.collection.Collection) def test_groupby(memorystore): memorystore.update( [ {"e": 7, "d": 9, "f": 9}, {"e": 7, "d": 9, "f": 10}, {"e": 8, "d": 9, "f": 11}, {"e": 9, "d": 10, "f": 12}, ], key="f", ) data = list(memorystore.groupby("d", properties={"e": 1, "f": 1})) assert len(data) == 2 grouped_by_9 = next(g[1] for g in data if g[0]["d"] == 9) assert len(grouped_by_9) == 3 assert all(d.get("f", False) for d in grouped_by_9) assert all(d.get("e", False) for d in grouped_by_9) grouped_by_10 = next(g[1] for g in data if g[0]["d"] == 10) assert len(grouped_by_10) == 1 data = list(memorystore.groupby(["e", "d"])) assert len(data) == 3 memorystore.update( [ {"e": {"d": 9}, "f": 9}, {"e": {"d": 9}, "f": 10}, {"e": {"d": 9}, "f": 11}, {"e": {"d": 10}, "f": 12}, ], key="f", ) data = list(memorystore.groupby("e.d", properties=["f"])) assert len(data) == 2 assert data[0][1][0].get("f", False) # Monty store tests def test_monty_store_connect(tmp_dir): montystore = MontyStore(collection_name="my_collection") assert montystore._coll is None montystore.connect() assert montystore._collection is not None assert montystore.name is not None # check that the kwargs work with ScratchDir("."): store = MontyStore("my_results", database_name="NotNamedDB") store.connect() store.update({"test": {"cow": "moo"}}, key="test") store.close() assert Path("NotNamedDB/my_results.collection").exists() def test_monty_store_groupby(montystore): montystore.update( [ {"e": 7, "d": 9, "f": 9}, {"e": 7, "d": 9, "f": 10}, {"e": 8, "d": 9, "f": 11}, {"e": 9, "d": 10, "f": 12}, ], key="f", ) data = list(montystore.groupby("d")) assert len(data) == 2 grouped_by_9 = next(g[1] for g in data if g[0]["d"] == 9) assert len(grouped_by_9) == 3 grouped_by_10 = next(g[1] for g in data if 
g[0]["d"] == 10) assert len(grouped_by_10) == 1 data = list(montystore.groupby(["e", "d"])) assert len(data) == 3 montystore.update( [ {"e": {"d": 9}, "f": 9}, {"e": {"d": 9}, "f": 10}, {"e": {"d": 9}, "f": 11}, {"e": {"d": 10}, "f": 12}, ], key="f", ) data = list(montystore.groupby("e.d")) assert len(data) == 2 def test_monty_store_query(montystore): montystore._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert montystore.query_one(properties=["a"])["a"] == 1 assert montystore.query_one(properties=["a"])["a"] == 1 assert montystore.query_one(properties=["b"])["b"] == 2 assert montystore.query_one(properties=["c"])["c"] == 3 def test_monty_store_count(montystore): montystore._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert montystore.count() == 1 montystore._collection.insert_one({"aa": 1, "b": 2, "c": 3}) assert montystore.count() == 2 assert montystore.count({"a": 1}) == 1 def test_monty_store_distinct(montystore): montystore._collection.insert_one({"a": 1, "b": 2, "c": 3}) montystore._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}}) assert set(montystore.distinct("a")) == {1, 4} # Test list distinct functionality montystore._collection.insert_one({"a": 4, "d": 6, "e": 7}) montystore._collection.insert_one({"a": 4, "d": 6, "g": {"h": 2}}) # Test distinct subdocument functionality ghs = montystore.distinct("g.h") assert set(ghs) == {1, 2} # Test when key doesn't exist assert montystore.distinct("blue") == [] # Test when null is a value montystore._collection.insert_one({"i": None}) assert montystore.distinct("i") == [None] def test_monty_store_update(montystore): montystore.update({"e": 6, "d": 4}, key="e") assert montystore.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])["d"] == 4 montystore.update([{"e": 7, "d": 8, "f": 9}], key=["d", "f"]) assert montystore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 7 montystore.update([{"e": 11, "d": 8, "f": 9}], key=["d", "f"]) assert montystore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 11 test_schema = { "type": "object", "properties": {"e": {"type": "integer"}}, "required": ["e"], } montystore.validator = JSONSchemaValidator(schema=test_schema) montystore.update({"e": 100, "d": 3}, key="e") # Continue to update doc when validator is not set to strict mode montystore.update({"e": "abc", "d": 3}, key="e") def test_monty_store_remove_docs(montystore): montystore._collection.insert_one({"a": 1, "b": 2, "c": 3}) montystore._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}}) montystore.remove_docs({"a": 1}) assert len(list(montystore.query({"a": 4}))) == 1 assert len(list(montystore.query({"a": 1}))) == 0 def test_monty_store_last_updated(montystore): assert montystore.last_updated == datetime.min start_time = datetime.utcnow() montystore._collection.insert_one({montystore.key: 1, "a": 1}) with pytest.raises(StoreError) as cm: # noqa: PT012 montystore.last_updated # noqa: B018 assert cm.match(montystore.last_updated_field) montystore.update([{montystore.key: 1, "a": 1, montystore.last_updated_field: datetime.utcnow()}]) assert montystore.last_updated > start_time def test_json_store_load(jsonstore, test_dir): jsonstore.connect() assert len(list(jsonstore.query())) == 20 jsonstore = JSONStore(test_dir / "test_set" / "c.json.gz") jsonstore.connect() assert len(list(jsonstore.query())) == 20 # test with non-default encoding jsonstore = JSONStore(test_dir / "test_set" / "c.json.gz", encoding="utf8") jsonstore.connect() assert len(list(jsonstore.query())) == 20 # 
confirm descriptive error raised if you get a KeyError jsonstore = JSONStore(test_dir / "test_set" / "c.json.gz", key="random_key") with pytest.raises(KeyError, match="Key field 'random_key' not found"): jsonstore.connect() # if the .json does not exist, it should be created with pytest.warns(DeprecationWarning, match="file_writable is deprecated"): jsonstore = JSONStore("a.json", file_writable=False) assert jsonstore.read_only is True # test loading an extended JSON file exported from MongoDB js2 = JSONStore(test_dir / "test_set" / "extended_json.json") js2.connect() assert js2.count() == 1 assert js2.query_one()["_id"] == ObjectId("64ebee18bd0b1265fe418be2") def test_json_store_writeable(test_dir): with ScratchDir("."): # if the .json does not exist, it should be created jsonstore = JSONStore("a.json", read_only=False) jsonstore.connect() assert Path("a.json").exists() # confirm RunTimeError with multiple paths with pytest.raises(RuntimeError, match="multiple JSON"): jsonstore = JSONStore(["a.json", "d.json"], read_only=False) shutil.copy(test_dir / "test_set" / "d.json", ".") jsonstore = JSONStore("d.json", read_only=False) jsonstore.connect() assert jsonstore.count() == 2 jsonstore.update({"new": "hello", "task_id": 2}) assert jsonstore.count() == 3 jsonstore.close() # repeat the above with the deprecated file_writable kwarg # if the .json does not exist, it should be created with pytest.warns(UserWarning, match="Received conflicting keyword arguments"): jsonstore = JSONStore("a.json", file_writable=True) assert jsonstore.read_only is False assert Path("a.json").exists() jsonstore.connect() # confirm RunTimeError with multiple paths with pytest.raises(RuntimeError, match="multiple JSON"): jsonstore = JSONStore(["a.json", "d.json"], file_writable=True) shutil.copy(test_dir / "test_set" / "d.json", ".") jsonstore = JSONStore("d.json", file_writable=True) jsonstore.connect() assert jsonstore.count() == 2 jsonstore.update({"new": "hello", "task_id": 2}) assert jsonstore.count() == 3 jsonstore.close() jsonstore = JSONStore("d.json", file_writable=True) jsonstore.connect() assert jsonstore.count() == 3 jsonstore.remove_docs({"a": 5}) assert jsonstore.count() == 2 jsonstore.close() jsonstore = JSONStore("d.json", file_writable=True) jsonstore.connect() assert jsonstore.count() == 2 jsonstore.close() with mock.patch("maggma.stores.JSONStore.update_json_file") as update_json_file_mock: jsonstore = JSONStore("d.json", file_writable=False) jsonstore.connect() jsonstore.update({"new": "hello", "task_id": 5}) assert jsonstore.count() == 3 jsonstore.close() update_json_file_mock.assert_not_called() with mock.patch("maggma.stores.JSONStore.update_json_file") as update_json_file_mock: jsonstore = JSONStore("d.json", file_writable=False) jsonstore.connect() jsonstore.remove_docs({"task_id": 5}) assert jsonstore.count() == 2 jsonstore.close() update_json_file_mock.assert_not_called() def test_jsonstore_orjson_options(test_dir): class SubFloat(float): pass with ScratchDir("."): jsonstore = JSONStore("d.json", read_only=False) jsonstore.connect() with pytest.raises(orjson.JSONEncodeError): jsonstore.update({"wrong_field": SubFloat(1.1), "task_id": 3}) jsonstore.close() jsonstore = JSONStore( "a.json", read_only=False, serialization_option=None, serialization_default=lambda x: "test", ) jsonstore.connect() jsonstore.update({"wrong_field": SubFloat(1.1), "task_id": 3}) jsonstore.close() def test_jsonstore_last_updated(test_dir): # files = [] # for f in ["a.json", "b.json"]: # files.append(test_dir / 
"test_set" / f) with ScratchDir("."): # if the .json does not exist, it should be created jsonstore = JSONStore("a.json", read_only=False) jsonstore.connect() start_time = datetime.utcnow() # NOTE: mongo only stores datetime with ms precision (apparently), and that # can cause the test below to fail. So we add a wait here. import time time.sleep(0.1) jsonstore.update( [ { jsonstore.key: 1, "a": 1, jsonstore.last_updated_field: datetime.utcnow(), } ] ) # These lines ensure that the read_json_file method gets called after # last_updated is written to the .json file jsonstore.close() jsonstore.connect() assert jsonstore.last_updated > start_time def test_eq(mongostore, memorystore, jsonstore): assert mongostore == mongostore assert memorystore == memorystore assert jsonstore == jsonstore assert mongostore != memorystore assert mongostore != jsonstore assert memorystore != jsonstore @pytest.mark.skipif( "mongodb+srv" not in os.environ.get("MONGODB_SRV_URI", ""), reason="requires special mongodb+srv URI", ) def test_mongo_uri(): uri = os.environ["MONGODB_SRV_URI"] store = MongoURIStore(uri, database="mp_core", collection_name="xas") store.connect() is_name = store.name is uri # This is try and keep the secret safe assert is_name def test_mongo_uri_localhost(): store = MongoURIStore("mongodb://localhost:27017/mp_core", collection_name="xas") store.connect() def test_mongo_uri_dbname_parse(): # test parsing dbname from uri uri_with_db = "mongodb://uuu:xxxx@host:27017/fake_db" store = MongoURIStore(uri_with_db, "test") assert store.database == "fake_db" uri_with_db = "mongodb://uuu:xxxx@host:27017/fake_db" store = MongoURIStore(uri_with_db, "test", database="fake_db2") assert store.database == "fake_db2" uri_with_db = "mongodb://uuu:xxxx@host:27017" with pytest.raises(ConfigurationError): MongoURIStore(uri_with_db, "test") maggma-0.70.0/tests/stores/test_open_data.py000066400000000000000000000523611470132070100210550ustar00rootroot00000000000000import gzip import pickle from datetime import datetime, timedelta from io import BytesIO, StringIO import boto3 import jsonlines import pandas as pd import pytest from bson import json_util from moto import mock_aws from maggma.stores.open_data import OpenDataStore, PandasMemoryStore, S3IndexStore, TasksOpenDataStore pd.set_option("future.no_silent_downcasting", True) # PandasMemoryStore tests @pytest.fixture() def memstore(): store = PandasMemoryStore(key="task_id") store.update( pd.DataFrame( [ { store.key: "mp-1", store.last_updated_field: datetime.utcnow(), "data": "asd", "int_val": 1, }, { store.key: "mp-3", store.last_updated_field: datetime.utcnow(), "data": "sdf", "int_val": 3, }, ] ) ) return store def test_pdmems_pickle(memstore): sobj = pickle.dumps(memstore) dobj = pickle.loads(sobj) assert hash(dobj) == hash(memstore) assert dobj == memstore def test_pdmems_query(memstore): # bad criteria with pytest.raises(AttributeError, match=r".*only support query or is_in"): memstore.query(criteria={"boo": "hoo"}) with pytest.raises(AttributeError, match=r".*please just use one or the other"): memstore.query( criteria={ "query": f"{memstore.key} == 'mp-1'", "is_in": ("data", ["sdf", "fdr"]), "boo": "hoo", } ) # all pd = memstore.query() assert len(pd) == 2 # query criteria pd = memstore.query(criteria={"query": f"{memstore.key} == 'mp-1'"}) assert len(pd) == 1 assert pd[memstore.key].iloc[0] == "mp-1" assert pd["data"].iloc[0] == "asd" # is_in criteria pd = memstore.query(criteria={"is_in": ("data", ["sdf", "fdr"]), "boo": "hoo"}) assert len(pd) == 1 
assert pd[memstore.key].iloc[0] == "mp-3" assert pd["data"].iloc[0] == "sdf" # properties pd = memstore.query(properties=[memstore.key, memstore.last_updated_field]) assert len(pd) == 2 assert len(pd[memstore.key]) == 2 with pytest.raises(KeyError): assert pd["data"] with pytest.raises(KeyError): memstore.query(properties=["fake"]) # sort pd = memstore.query(sort={memstore.key: -1}) assert len(pd) == 2 assert pd[memstore.key].iloc[0] == "mp-3" # skip pd = memstore.query(sort={memstore.key: -1}, skip=1) assert len(pd) == 1 assert pd[memstore.key].iloc[0] == "mp-1" # limit pd = memstore.query(sort={memstore.key: -1}, limit=1) assert len(pd) == 1 assert pd[memstore.key].iloc[0] == "mp-3" # all pd = memstore.query( criteria={ "is_in": ("data", ["sdf", "fdr", "asd"]), "boo": "hoo", }, properties=[memstore.key], sort={"data": -1}, skip=1, limit=1, ) assert len(pd) == 1 assert pd[memstore.key].iloc[0] == "mp-1" with pytest.raises(KeyError): assert pd["data"] def test_pdmems_count(memstore): assert memstore.count() == 2 assert memstore.count(criteria={"query": f"{memstore.key} == 'mp-1'"}) == 1 assert memstore.count(criteria={"is_in": ("data", ["sdf", "fdr"]), "boo": "hoo"}) == 1 assert PandasMemoryStore(key="task_id").count() == 0 @pytest.fixture() def memstore2(): store = PandasMemoryStore(key="task_id") store.update( pd.DataFrame( [ { store.key: "mp-1", store.last_updated_field: datetime.utcnow() - timedelta(hours=1), "data": "asd", }, { store.key: "mp-2", store.last_updated_field: datetime.utcnow() + timedelta(hours=1), "data": "asd", }, { store.key: "mp-3", store.last_updated_field: datetime.utcnow() + timedelta(hours=1), "data": "sdf", }, ] ) ) return store def test_pdmems_distinct(memstore2): assert len(memstore2.distinct(field=memstore2.key)) == len(memstore2._data) assert len(memstore2.distinct(field="data")) == 2 assert len(memstore2.distinct(field=memstore2.key, criteria={"query": "data == 'asd'"})) == 2 def test_pdmems_last_updated(memstore2): assert memstore2._data[memstore2.last_updated_field].iloc[2] == memstore2.last_updated store = PandasMemoryStore(key="task_id") assert store.last_updated == datetime.min def test_pdmems_newer_in(memstore, memstore2): s = memstore.newer_in(memstore2) assert len(s) == 2 assert "mp-2" in s.unique() assert "mp-3" in s.unique() s = memstore.newer_in(target=memstore2, criteria={"query": "data == 'asd'"}, exhaustive=True) assert len(s) == 1 assert "mp-2" in s.unique() with pytest.raises(AttributeError): memstore.newer_in(target=memstore2, criteria={"query": "data == 'asd'"}, exhaustive=False) def test_pdmems_update(memstore): df = pd.DataFrame( [ { memstore.key: "mp-1", memstore.last_updated_field: datetime.utcnow(), "data": "boo", "int_val": 1, } ] ) df2 = memstore.update(df) assert len(memstore._data) == 2 assert memstore.query(criteria={"query": f"{memstore.key} == 'mp-1'"})["data"].iloc[0] == "boo" assert memstore.query(criteria={"query": f"{memstore.key} == 'mp-1'"})["int_val"].iloc[0] == 1 assert df2.equals(df) df = pd.DataFrame( [ { memstore.key: "mp-2", memstore.last_updated_field: datetime.utcnow(), "data": "boo", "int_val": 2, } ] ) df2 = memstore.update(df) assert len(memstore._data) == 3 assert memstore.query(criteria={"query": f"{memstore.key} == 'mp-2'"})["data"].iloc[0] == "boo" assert memstore.query(criteria={"query": f"{memstore.key} == 'mp-2'"})["int_val"].iloc[0] == 2 assert df2.equals(df) @pytest.fixture() def s3indexstore(): data = [{"task_id": "mp-1", "last_updated": datetime.utcnow()}] with mock_aws(): conn = boto3.resource("s3", 
region_name="us-east-1") conn.create_bucket(Bucket="bucket1") client = boto3.client("s3", region_name="us-east-1") string_io = StringIO() with jsonlines.Writer(string_io, dumps=json_util.dumps) as writer: for _, row in pd.DataFrame(data).iterrows(): writer.write(row.to_dict()) client.put_object( Bucket="bucket1", Body=BytesIO(string_io.getvalue().encode("utf-8")), Key="manifest.jsonl", ) store = S3IndexStore(collection_name="index", bucket="bucket1", key="task_id") store.connect() yield store def test_s3is_pickle(s3indexstore): sobj = pickle.dumps(s3indexstore) dobj = pickle.loads(sobj) assert hash(dobj) == hash(s3indexstore) assert dobj == s3indexstore def test_s3is_connect_retrieve_manifest(s3indexstore): assert s3indexstore.retrieve_manifest().equals(s3indexstore._data) with mock_aws(): with pytest.raises(s3indexstore.s3_client.exceptions.NoSuchBucket): S3IndexStore(collection_name="foo", bucket="bucket2", key="task_id").retrieve_manifest() with pytest.raises(RuntimeError): S3IndexStore(collection_name="foo", bucket="bucket2", key="task_id").connect() conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket2") s3is = S3IndexStore(collection_name="foo", bucket="bucket2", key="task_id") s3is.connect() assert s3is._data is None assert s3is.retrieve_manifest() is None assert s3is.count() == 0 def test_s3is_store_manifest(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket2") s3is = S3IndexStore(collection_name="foo", bucket="bucket2", key="task_id") s3is.connect() s3is.update(pd.DataFrame([{"task_id": "mp-2", "last_updated": "now"}])) s3is.store_manifest() df = s3is.retrieve_manifest() assert len(df) == 1 assert df.equals(s3is._data) s3is.update(pd.DataFrame([{"task_id": "mp-3", "last_updated": "later"}])) df = s3is.retrieve_manifest() assert not df.equals(s3is._data) def test_s3is_close(s3indexstore): s3indexstore.close() assert len(s3indexstore.query()) == 1 # actions auto-reconnect s3indexstore.update(pd.DataFrame([{"task_id": "mp-2", "last_updated": "now"}])) assert len(s3indexstore.query()) == 2 s3indexstore.close() assert len(s3indexstore.query()) == 2 # actions auto-reconnect s3indexstore.close() s3indexstore.connect() assert len(s3indexstore.query()) == 1 # explicit connect reloads manifest @pytest.fixture() def s3store(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") store = OpenDataStore( collection_name="index", bucket="bucket1", key="task_id", object_grouping=["group_level_two", "task_id"] ) store.connect() store.update( pd.DataFrame( [ { "task_id": "mp-1", "data": "asd", store.last_updated_field: datetime.utcnow(), "group": {"level_two": 4}, }, { "task_id": "mp-3", "data": "sdf", store.last_updated_field: datetime.utcnow(), "group": {"level_two": 4}, }, ] ) ) yield store @pytest.fixture() def s3store_w_subdir(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") conn.create_bucket(Bucket="bucket2") index = S3IndexStore(collection_name="index", bucket="bucket1", key="task_id") store = OpenDataStore( index=index, collection_name=index.collection_name, bucket="bucket2", key=index.key, prefix="subdir1", object_grouping=["data_foo"], ) store.connect() store.update( pd.DataFrame( [ { "task_id": "mp-1", "data": {"foo": "asd"}, store.last_updated_field: datetime.utcnow(), }, { "task_id": "mp-3", "data": {"foo": "sdf"}, store.last_updated_field: datetime.utcnow(), }, ] ) ) yield store def 
test_pickle(s3store): sobj = pickle.dumps(s3store) dobj = pickle.loads(sobj) assert hash(dobj) == hash(s3store) assert dobj == s3store assert len(dobj.query(criteria={"query": "task_id == 'mp-2'"})) == 0 assert dobj.query(criteria={"query": "task_id == 'mp-1'"})["data"].iloc[0] == "asd" def test_index_property(s3store, s3store_w_subdir): assert s3store.index != s3store assert s3store_w_subdir.index != s3store_w_subdir def test_read_doc_from_s3(): data = [ {"task_id": "mp-1", "last_updated": datetime.utcnow(), "group": "foo", "also_group": "boo"}, {"task_id": "mp-2", "last_updated": datetime.utcnow(), "group": "foo", "also_group": "boo"}, ] with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") client = boto3.client("s3", region_name="us-east-1") string_io = StringIO() with jsonlines.Writer(string_io, dumps=json_util.dumps) as writer: for _, row in pd.DataFrame(data).iterrows(): writer.write(row.to_dict()) client.put_object( Bucket="bucket1", Body=BytesIO(gzip.compress(string_io.getvalue().encode("utf-8"))), Key="group=foo/also_group=boo.jsonl.gz", ) store = OpenDataStore( collection_name="index", bucket="bucket1", key="task_id", object_grouping=["group", "also_group"] ) store.connect() df = store._read_doc_from_s3(file_id="group=foo/also_group=boo.jsonl.gz") assert len(df) == 2 assert store._get_full_key_path(index=df) == "group=foo/also_group=boo.jsonl.gz" assert (df["task_id"] == "mp-1").any() assert (df["task_id"] == "mp-2").any() @pytest.mark.xfail( reason="Known issue, the store is in a deprecated state, and in particular may be incompatible with numpy 2.0+" ) def test_update(s3store): assert len(s3store.index_data) == 2 s3store.update( pd.DataFrame( [ { "task_id": "mp-199999", "data": "asd", "group": {"level_two": 4}, s3store.last_updated_field: datetime.utcnow(), } ] ) ) assert len(s3store.index_data) == 3 with pytest.raises(KeyError): assert s3store.index_data.query("task_id == 'mp-199999'")["data"].iloc[0] == "asd" s3store.update( pd.DataFrame( [ { "task_id": "mp-199999", "data": "foo", "group": {"level_two": 4}, s3store.last_updated_field: datetime.utcnow(), } ] ) ) assert len(s3store.index_data) == 3 assert len(s3store.index_data.query("task_id == 'mp-199999'")) == 1 mp4 = [{"task_id": "mp-4", "data": "asd", "group": {"level_two": 4}, s3store.last_updated_field: datetime.utcnow()}] s3store.update(pd.DataFrame(mp4)) assert len(s3store.index_data) == 4 mp4_index = [{"task_id": "mp-4", "group_level_two": 4, s3store.last_updated_field: datetime.utcnow()}] assert s3store._get_full_key_path(pd.DataFrame(mp4_index)) == "group_level_two=4/task_id=mp-4.jsonl.gz" s3store.s3_client.head_object(Bucket=s3store.bucket, Key=s3store._get_full_key_path(pd.DataFrame(mp4_index))) def test_query(s3store): assert len(s3store.query(criteria={"query": "task_id == 'mp-2'"})) == 0 assert s3store.query(criteria={"query": "task_id == 'mp-1'"})["data"].iloc[0] == "asd" assert s3store.query(criteria={"query": "task_id == 'mp-3'"})["data"].iloc[0] == "sdf" assert s3store.query(criteria={"query": "task_id == 'mp-1'"}, properties=["task_id"])["task_id"].iloc[0] == "mp-1" assert ( s3store.query(criteria={"query": "task_id == 'mp-1'"}, properties=["task_id", "data"])["data"].iloc[0] == "asd" ) assert len(s3store.query()) == 2 # will use optimized search df = s3store.query(criteria={"query": "task_id == 'mp-1'"}, properties=["task_id"], criteria_fields=["task_id"]) assert len(df) == 1 assert df["task_id"].iloc[0] == "mp-1" with pytest.raises(KeyError): assert 
df["data"].iloc[0] == "asd" # will use optimized search df = s3store.query(properties=["task_id"], criteria_fields=[]) assert len(df) == 2 # will not use optimized search even with hints since data is not in the searchable_fields df = s3store.query(criteria={"query": "data == 'asd'"}, properties=["task_id"], criteria_fields=["data"]) assert len(df) == 1 assert df["task_id"].iloc[0] == "mp-1" with pytest.raises(KeyError): assert df["data"].iloc[0] == "asd" def test_rebuild_index_from_s3_data(s3store): data = [ {"task_id": "mp-2", "data": "asd", s3store.last_updated_field: datetime.utcnow(), "group": {"level_two": 4}} ] client = boto3.client("s3", region_name="us-east-1") string_io = StringIO() with jsonlines.Writer(string_io, dumps=json_util.dumps) as writer: for _, row in pd.DataFrame(data).iterrows(): writer.write(row.to_dict()) data = [ {"task_id": "mp-99", "data": "asd", s3store.last_updated_field: datetime.utcnow(), "group": {"level_two": 4}} ] string_io2 = StringIO() with jsonlines.Writer(string_io2, dumps=json_util.dumps) as writer: for _, row in pd.DataFrame(data).iterrows(): writer.write(row.to_dict()) client.put_object( Bucket="bucket1", Body=BytesIO(gzip.compress(string_io.getvalue().encode("utf-8"))), Key="group_level_two=4/task_id=mp-2.jsonl.gz", ) # creating file that should not be indexed to test that it gets skipped client.put_object( Bucket="bucket1", Body=BytesIO(gzip.compress(string_io2.getvalue().encode("utf-8"))), Key="task_id=mp-99.gz", ) assert len(s3store.index.index_data) == 2 index_docs = s3store.rebuild_index_from_s3_data() assert len(index_docs) == 3 assert len(s3store.index.index_data) == 3 for key in index_docs.columns: assert key == "task_id" or key == "last_updated" or key == "group_level_two" def test_rebuild_index_from_data(s3store): data = [ {"task_id": "mp-2", "data": "asd", s3store.last_updated_field: datetime.utcnow(), "group": {"level_two": 4}} ] index_docs = s3store.rebuild_index_from_data(pd.DataFrame(data)) assert len(index_docs) == 1 assert len(s3store.index.index_data) == 1 for key in index_docs.columns: assert key == "task_id" or key == "last_updated" or key == "group_level_two" def test_count_subdir(s3store_w_subdir): s3store_w_subdir.update( pd.DataFrame( [{"task_id": "mp-1", "data": {"foo": "asd"}, s3store_w_subdir.last_updated_field: datetime.utcnow()}] ) ) s3store_w_subdir.update( pd.DataFrame( [{"task_id": "mp-2", "data": {"foo": "asd"}, s3store_w_subdir.last_updated_field: datetime.utcnow()}] ) ) assert len(s3store_w_subdir.query()) == 3 assert len(s3store_w_subdir.query(criteria=None)) == 3 assert s3store_w_subdir.count() == 3 assert s3store_w_subdir.count({"query": "task_id == 'mp-2'"}) == 1 def test_subdir_storage(s3store_w_subdir): def objects_in_bucket(key): objs = s3store_w_subdir.s3_client.list_objects_v2(Bucket=s3store_w_subdir.bucket, Prefix=key) return key in [o["Key"] for o in objs["Contents"]] s3store_w_subdir.update( pd.DataFrame( [{"task_id": "mp-1", "data": {"foo": "asd"}, s3store_w_subdir.last_updated_field: datetime.utcnow()}] ) ) s3store_w_subdir.update( pd.DataFrame( [{"task_id": "mp-2", "data": {"foo": "asd"}, s3store_w_subdir.last_updated_field: datetime.utcnow()}] ) ) assert objects_in_bucket("subdir1/data_foo=asd.jsonl.gz") assert objects_in_bucket("subdir1/data_foo=sdf.jsonl.gz") def test_additional_metadata(s3store): tic = datetime(2018, 4, 12, 16) data = [{"task_id": f"mp-{i}", "a": i, s3store.last_updated_field: tic} for i in range(4)] with pytest.raises(TypeError): s3store.update(data, key="a", 
additional_metadata="task_id") def test_rebuild_index_from_s3_for_tasks_store(): data = [{"task_id": "mp-2", "data": "asd", "last_updated": datetime.utcnow(), "group": {"level_two": 4}}] with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") string_io = StringIO() with jsonlines.Writer(string_io, dumps=json_util.dumps) as writer: for _, row in pd.DataFrame(data).iterrows(): writer.write(row.to_dict()) client = boto3.client("s3", region_name="us-east-1") client.put_object( Bucket="bucket1", Body=BytesIO(gzip.compress(string_io.getvalue().encode("utf-8"))), Key="group_level_two=4/dt=some_random_data.jsonl.gz", ) store = TasksOpenDataStore( collection_name="index", bucket="bucket1", key="task_id", object_grouping=["group_level_two", "dt"] ) store.connect() index_docs = store.rebuild_index_from_s3_data() assert len(index_docs) == 1 assert len(store.index.index_data) == 1 for key in index_docs.columns: assert key == "task_id" or key == "last_updated" or key == "group_level_two" or key == "dt" assert index_docs["dt"].iloc[0] == "some_random_data" def test_no_update_for_tasks_store(): with mock_aws(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="bucket1") store = TasksOpenDataStore( collection_name="index", bucket="bucket1", key="task_id", object_grouping=["group_level_two", "dt"] ) store.connect() with pytest.raises(NotImplementedError): store.update( pd.DataFrame( [ { "task_id": "mp-199999", "data": "foo", "group": {"level_two": 4}, store.last_updated_field: datetime.utcnow(), } ] ) ) maggma-0.70.0/tests/stores/test_shared_stores.py000066400000000000000000000222161470132070100217640ustar00rootroot00000000000000import pymongo import pytest from pymongo.errors import DocumentTooLarge, OperationFailure from maggma.stores import GridFSStore, MemoryStore, MongoStore from maggma.stores.shared_stores import MultiStore, StoreFacade from maggma.validators import JSONSchemaValidator @pytest.fixture() def mongostore(): store = MongoStore("maggma_test", "test") store.connect() yield store store._collection.drop() @pytest.fixture() def gridfsstore(): store = GridFSStore("maggma_test", "test", key="task_id") store.connect() yield store store._files_collection.drop() store._chunks_collection.drop() @pytest.fixture() def multistore(): return MultiStore() @pytest.fixture() def memorystore(): store = MemoryStore() store.connect() return store def test_add_stores(multistore, mongostore, gridfsstore): # Should be empty at the start assert multistore.count_stores() == 0 multistore.ensure_store(mongostore) assert multistore.count_stores() == 1 assert multistore.get_store_index(mongostore) == 0 # Attempting to reinsert this store should do nothing multistore.ensure_store(mongostore) assert multistore.count_stores() == 1 # Make a copy of the mongostore and it should still do nothing temp_mongostore = MongoStore.from_dict(mongostore.as_dict()) multistore.ensure_store(temp_mongostore) assert multistore.count_stores() == 1 assert multistore.get_store_index(temp_mongostore) == 0 # Add this copy again, but don't use ensure_store # This tests the case in which a prior thread added # the store, but this current process was already # waiting for the lock acquisition multistore.add_store(temp_mongostore) assert multistore.count_stores() == 1 # Add the GridFSStore to the MultiStore() multistore.ensure_store(gridfsstore) assert multistore.count_stores() == 2 assert multistore.get_store_index(gridfsstore) == 1 # Add something that isn't a store 
class DummyObject: def __init__(self, a: int): self.a = a with pytest.raises(TypeError): multistore.ensure_store(DummyObject(1)) def test_store_facade(multistore, mongostore, gridfsstore): StoreFacade(mongostore, multistore) assert multistore.count_stores() == 1 assert multistore.get_store_index(mongostore) == 0 StoreFacade(gridfsstore, multistore) assert multistore.count_stores() == 2 assert multistore.get_store_index(gridfsstore) == 1 def test_multistore_query(multistore, mongostore, memorystore): memorystore_facade = StoreFacade(memorystore, multistore) mongostore_facade = StoreFacade(mongostore, multistore) temp_mongostore_facade = StoreFacade(MongoStore.from_dict(mongostore.as_dict()), multistore) memorystore_facade._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert memorystore_facade.query_one(properties=["a"])["a"] == 1 assert memorystore_facade.query_one(properties=["a"])["a"] == 1 assert memorystore_facade.query_one(properties=["b"])["b"] == 2 assert memorystore_facade.query_one(properties=["c"])["c"] == 3 mongostore_facade._collection.insert_one({"a": 4, "b": 5, "c": 6}) assert mongostore_facade.query_one(properties=["a"])["a"] == 4 assert mongostore_facade.query_one(properties=["a"])["a"] == 4 assert mongostore_facade.query_one(properties=["b"])["b"] == 5 assert mongostore_facade.query_one(properties=["c"])["c"] == 6 assert temp_mongostore_facade.query_one(properties=["a"])["a"] == 4 assert temp_mongostore_facade.query_one(properties=["a"])["a"] == 4 assert temp_mongostore_facade.query_one(properties=["b"])["b"] == 5 assert temp_mongostore_facade.query_one(properties=["c"])["c"] == 6 def test_multistore_count(multistore, mongostore, memorystore): memorystore_facade = StoreFacade(memorystore, multistore) memorystore_facade._collection.insert_one({"a": 1, "b": 2, "c": 3}) assert memorystore_facade.count() == 1 memorystore_facade._collection.insert_one({"aa": 1, "b": 2, "c": 3}) assert memorystore_facade.count() == 2 assert memorystore_facade.count({"a": 1}) == 1 def test_multistore_distinct(multistore, mongostore): mongostore_facade = StoreFacade(mongostore, multistore) mongostore_facade._collection.insert_one({"a": 1, "b": 2, "c": 3}) mongostore_facade._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}}) assert set(mongostore_facade.distinct("a")) == {1, 4} # Test list distinct functionality mongostore_facade._collection.insert_one({"a": 4, "d": 6, "e": 7}) mongostore_facade._collection.insert_one({"a": 4, "d": 6, "g": {"h": 2}}) # Test distinct subdocument functionality ghs = mongostore_facade.distinct("g.h") assert set(ghs) == {1, 2} # Test when key doesn't exist assert mongostore_facade.distinct("blue") == [] # Test when null is a value mongostore_facade._collection.insert_one({"i": None}) assert mongostore_facade.distinct("i") == [None] # Test to make sure DocumentTooLarge errors get dealt with properly using built in distinct mongostore_facade._collection.insert_many([{"key": [f"mp-{i}"]} for i in range(1000000)]) vals = mongostore_facade.distinct("key") # Test to make sure distinct on array field is unraveled when using manual distinct assert len(vals) == len(list(range(1000000))) assert all(isinstance(v, str) for v in vals) # Test to make sure manual distinct uses the criteria query mongostore_facade._collection.insert_many([{"key": f"mp-{i}", "a": 2} for i in range(1000001, 2000001)]) vals = mongostore_facade.distinct("key", {"a": 2}) assert len(vals) == len(list(range(1000001, 2000001))) def test_multistore_update(multistore, mongostore): 
mongostore_facade = StoreFacade(mongostore, multistore) mongostore_facade.update({"e": 6, "d": 4}, key="e") assert mongostore_facade.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])["d"] == 4 mongostore_facade.update([{"e": 7, "d": 8, "f": 9}], key=["d", "f"]) assert mongostore_facade.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 7 mongostore_facade.update([{"e": 11, "d": 8, "f": 9}], key=["d", "f"]) assert mongostore_facade.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 11 test_schema = { "type": "object", "properties": {"e": {"type": "integer"}}, "required": ["e"], } mongostore_facade.validator = JSONSchemaValidator(schema=test_schema) mongostore_facade.update({"e": 100, "d": 3}, key="e") # Continue to update doc when validator is not set to strict mode mongostore_facade.update({"e": "abc", "d": 3}, key="e") # ensure safe_update works to not throw DocumentTooLarge errors large_doc = {f"mp-{i}": f"mp-{i}" for i in range(1000000)} large_doc["e"] = 999 with pytest.raises((OperationFailure, DocumentTooLarge)): mongostore_facade.update([large_doc, {"e": 1001}], key="e") mongostore_facade.safe_update = True assert mongostore_facade.safe_update is True mongostore_facade.update([large_doc, {"e": 1001}], key="e") assert mongostore_facade.query_one({"e": 1001}) is not None def test_multistore_groupby(multistore, mongostore): mongostore_facade = StoreFacade(mongostore, multistore) mongostore_facade.update( [ {"e": 7, "d": 9, "f": 9}, {"e": 7, "d": 9, "f": 10}, {"e": 8, "d": 9, "f": 11}, {"e": 9, "d": 10, "f": 12}, ], key="f", ) data = list(mongostore_facade.groupby("d")) assert len(data) == 2 grouped_by_9 = next(g[1] for g in data if g[0]["d"] == 9) assert len(grouped_by_9) == 3 grouped_by_10 = next(g[1] for g in data if g[0]["d"] == 10) assert len(grouped_by_10) == 1 data = list(mongostore_facade.groupby(["e", "d"])) assert len(data) == 3 def test_multistore_remove_docs(multistore, mongostore): mongostore_facade = StoreFacade(mongostore, multistore) mongostore_facade._collection.insert_one({"a": 1, "b": 2, "c": 3}) mongostore_facade._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}}) mongostore_facade.remove_docs({"a": 1}) assert len(list(mongostore_facade.query({"a": 4}))) == 1 assert len(list(mongostore_facade.query({"a": 1}))) == 0 def test_multistore_connect_reconnect(multistore, mongostore): mongostore_facade = StoreFacade(mongostore, multistore) assert isinstance(mongostore_facade._collection, pymongo.collection.Collection) mongostore_facade.close() assert mongostore_facade._coll is None mongostore_facade.connect() # Test using the multistore to close connections multistore.close_all() assert mongostore_facade._coll is None multistore.connect_all() assert isinstance(mongostore_facade._collection, pymongo.collection.Collection) def test_multistore_name(multistore, mongostore): mongostore_facade = StoreFacade(mongostore, multistore) assert mongostore_facade.name == "mongo://localhost/maggma_test/test" def test_multistore_ensure_index(multistore, mongostore): mongostore_facade = StoreFacade(mongostore, multistore) assert mongostore_facade.ensure_index("test_key") # TODO: How to check for exception? 
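# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the upstream test suite): a minimal
# end-to-end view of the MultiStore/StoreFacade pattern the tests above
# exercise. It uses only names already imported at the top of this module
# (MultiStore, StoreFacade, MongoStore) and only methods the tests call;
# the database/collection names and the helper name below are placeholders,
# and a MongoDB instance on localhost is assumed, just as in the tests.
def _example_shared_store_usage():
    shared = MultiStore()                      # process-wide registry of stores
    primary = MongoStore("maggma_test", "example_collection")
    shared.ensure_store(primary)               # idempotent registration
    facade = StoreFacade(primary, shared)      # proxy that routes calls through the MultiStore
    shared.connect_all()                       # (re)connect every registered store
    facade.update({"task_id": "mp-1", "value": 42}, key="task_id")
    doc = facade.query_one(criteria={"task_id": "mp-1"}, properties=["value"])
    shared.close_all()                         # close every registered store at once
    return doc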
maggma-0.70.0/tests/stores/test_ssh_tunnel.py000066400000000000000000000043121470132070100212760ustar00rootroot00000000000000import paramiko import pymongo import pytest from monty.serialization import dumpfn, loadfn from paramiko.ssh_exception import AuthenticationException, NoValidConnectionsError, SSHException from maggma.stores.mongolike import MongoStore from maggma.stores.ssh_tunnel import SSHTunnel @pytest.fixture() def ssh_server_available(): # noqa: PT004 """Fixture to determine if an SSH server is available to test the SSH tunnel.""" client = paramiko.SSHClient() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) try: client.connect("127.0.0.1", 22) client.close() except (AuthenticationException, NoValidConnectionsError, SSHException): pytest.skip("No SSH server to test tunnel against") def local_port_available(local_port): """Fixture to determine if a local port is available to test the SSH tunnel.""" client = paramiko.SSHClient() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) try: client.connect("127.0.0.1", local_port) client.close() except (AuthenticationException, NoValidConnectionsError, SSHException): pytest.skip("Local port unavailable to test tunnel against") @pytest.mark.parametrize("local_port", [None, 9000]) def test_mongostore_connect_via_ssh(ssh_server_available, local_port): if local_port is not None: local_port_available(local_port) tunnel = SSHTunnel("127.0.0.1:22", "127.0.0.1:27017", local_port=local_port) mongostore = MongoStore("maggma_test", "test", ssh_tunnel=tunnel) mongostore.connect() assert isinstance(mongostore._collection, pymongo.collection.Collection) mongostore.remove_docs({}) assert mongostore.count() == 0 mongostore.update([{"task_id": 0}]) assert mongostore.count() == 1 mongostore.remove_docs({}) mongostore.close() @pytest.mark.parametrize("local_port", [None, 9000]) def test_serialization(tmpdir, ssh_server_available, local_port): if local_port is not None: local_port_available(local_port) tunnel = SSHTunnel("127.0.0.1:22", "127.0.0.1:27017", local_port=local_port) dumpfn(tunnel, tmpdir / "tunnel.json") new_tunnel = loadfn(tmpdir / "tunnel.json") assert isinstance(new_tunnel, SSHTunnel) maggma-0.70.0/tests/test_files/000077500000000000000000000000001470132070100163255ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/000077500000000000000000000000001470132070100215175ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation1/000077500000000000000000000000001470132070100240765ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation1/calc1_subdir/000077500000000000000000000000001470132070100264315ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation1/calc1_subdir/file_2_levels_deep.json000066400000000000000000000000001470132070100330210ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation1/input.in000066400000000000000000000001331470132070100255620ustar00rootroot00000000000000This is the file named input.in In directory calculation1 in the FileStore test directory. 
maggma-0.70.0/tests/test_files/file_store_test/calculation1/output.out000066400000000000000000000000001470132070100261550ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation2/000077500000000000000000000000001470132070100240775ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation2/input.in000066400000000000000000000000001470132070100255540ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/calculation2/output.out000066400000000000000000000000001470132070100261560ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/file_store_test/file_in_root.dat000066400000000000000000000000001470132070100246470ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/settings_files/000077500000000000000000000000001470132070100213475ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/settings_files/db.json000066400000000000000000000001731470132070100226300ustar00rootroot00000000000000{ "aliases": {}, "collection": "tmp", "database": "maggma_tests", "host": "localhost", "port": 27017 } maggma-0.70.0/tests/test_files/settings_files/my_launchpad.yaml000066400000000000000000000000631470132070100246760ustar00rootroot00000000000000host: "localhost" port: 27017 name: "maggma_tests" maggma-0.70.0/tests/test_files/simple_bib_example_data/000077500000000000000000000000001470132070100231365ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-1.bibtex000066400000000000000000000021031470132070100264640ustar00rootroot00000000000000@article{10.1093/cid/ciaa225, author = {Wang, Shaoshuai and Guo, Lili and Chen, Ling and Liu, Weiyong and Cao, Yong and Zhang, Jingyi and Feng, Ling}, title = "{A case report of neonatal COVID-19 infection in China}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, abstract = "{In December 2019, the 2019 novel coronavirus disease (COVID-19) caused by SARS-CoV-2 emerged in China and now has spread in many countries. Pregnant women are susceptible population of COVID-19 which are more likely to have complications and even progresse to severe illness. We report a case of neonatal COVID-19 infection in China with pharyngeal swabs tested positive by rRT-PCR assay 36 hours after birth. However, whether the case is a vertical transmission from mother to child remains to be confirmed.}", issn = {1058-4838}, doi = {10.1093/cid/ciaa225}, url = {https://doi.org/10.1093/cid/ciaa225}, note = {ciaa225}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa225/32894061/ciaa225.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-2.bibtex000066400000000000000000000045271470132070100265010ustar00rootroot00000000000000@article{10.1093/cid/ciaa247, author = {Tang, Dahai and Yao, Feifei and Wang, Lijie and Zheng, Ling and Gao, Yongjun and Ye, Jun and Guo, Feng and Zhao, Hui and Gao, Rongbao}, title = "{A comparative study on the clinical features of COVID-19 pneumonia to other pneumonias}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, abstract = "{A novel coronavirus (2019-nCoV) has raised world concern since it emerged in Wuhan Hubei China in December, 2019. The infection may result into severe pneumonia with clusters illness onsets. 
Its impacts on public health make it paramount to clarify the clinical features with other pneumonias.Nineteen 2019-nCoV pneumonia (NCOVID-19) and fifteen other pneumonia patients (NON-NCOVID-19) in out of Hubei places were involved in this study. Both NCOVID-19 and NON-NCOVID-19 patients were confirmed to be infected in throat swabs or/and sputa with or without 2019-nCoV by real-time RT-PCR. We analyzed the demographic, epidemiological, clinical, and radiological features from those patients, and compared the difference between NCOVID-19 and NON-NCOVID-19.All patients had a history of exposure to confirmed case of 2019-nCoV or travel to Hubei before illness. The median duration, respectively, was 8 (IQR:6~11) and 5 (IQR:4~11) days from exposure to onset in NCOVID-19 and NON-NCOVID-19. The clinical symptoms were similar between NCOVID-19 and NON-NCOVID-19. The most common symptoms were fever and cough. Fifteen (78.95\\%) NCOVID-19 but 4 (26.67\\%) NON-NCOVID-19 patients had bilateral involvement while 17 (89.47\\%) NCOVID-19 but 1 (6.67\\%) NON-NCOVID-19 patients had multiple mottling and ground-glass opacity of chest CT images. Compared to NON-NCOVID-19, NCOVID-19 present remarkably more abnormal laboratory tests including AST, ALT, γ-GT, LDH and α-HBDH.The 2019-nCoV infection caused similar onsets to other pneumonias. CT scan may be a reliable test for screening NCOVID-19 cases. Liver function damage is more frequent in NCOVID-19 than NON-NCOVID-19 patients. LDH and α-HBDH may be considerable markers for evaluation of NCOVID-19.}", issn = {1058-4838}, doi = {10.1093/cid/ciaa247}, url = {https://doi.org/10.1093/cid/ciaa247}, note = {ciaa247}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa247/32894214/ciaa247.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-3.bibtex000066400000000000000000000011461470132070100264740ustar00rootroot00000000000000@article{10.1093/cid/ciaa296, author = {Pan, Daniel and Wong, Nicholas and Toovey, Oliver and Hills, George and Stephenson, Iain}, title = "{A Multicenter longitudinal cohort study of cryptococcosis in Human Immunodeficiency Virus-negative people in the United States}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, issn = {1058-4838}, doi = {10.1093/cid/ciaa296}, url = {https://doi.org/10.1093/cid/ciaa296}, note = {ciaa296}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa296/32928026/ciaa296.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-4.bibtex000066400000000000000000000057441470132070100265050ustar00rootroot00000000000000@article{10.1093/cid/ciaa290, author = {Borghesi, Alessandro and Trück, Johannes and Asgari, Samira and Sancho-Shimizu, Vanessa and Agyeman, Philipp K A and Bellos, Evangelos and Giannoni, Eric and Stocker, Martin and Posfay-Barbe, Klara M and Heininger, Ulrich and Bernhard-Stirnemann, Sara and Niederer-Loher, Anita and Kahlert, Christian R and Natalucci, Giancardlo and Relly, Christa and Riedel, Thomas and Kuehni, Claudia E and Thorball, Christian W and Chaturvedi, Nimisha and Martinon-Torres, Federico and Kuijpers, Taco W and Coin, Lachlan and Wright, Victoria and Herberg, Jethro and Levin, Michael and Aebi, Christoph and Berger, Christoph and Fellay, Jacques and Schlapbach, Luregn J and for the EUCLIDS consortium and the Swiss Paediatric Sepsis Study}, title = "{Whole-exome sequencing for the identification of rare variants in primary immunodeficiency genes in children with sepsis - a 
prospective population-based cohort study}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, abstract = "{The role of primary immunodeficiencies (PID) in susceptibility to sepsis remains unknown. It is unclear whether children with sepsis benefit from genetic investigations. We hypothesized that sepsis may represent the first manifestation of underlying PID. We applied whole-exome sequencing (WES) to a national cohort of children with sepsis to identify rare, predicted pathogenic variants in PID genes.Multicenter population-based prospective study including previously healthy children ≥28 days and \\<17 years admitted with blood culture-proven sepsis. Using a stringent variant filtering procedure, analysis of WES data was restricted to rare, predicted pathogenic variants in 240 PID genes for which increased susceptibility to bacterial infection has been reported.176 children presenting with 185 sepsis episodes underwent WES (median age 52 months, IQR 15.4-126.4). 41 unique predicted pathogenic PID variants (1 homozygous, 5 hemizygous, and 35 heterozygous) were found in 35/176 (20\\%) patients, including 3/176 (2\\%) patients carrying variants which were previously reported to lead to PID. The variants occurred in PID genes across all 8 PID categories as defined by the International Union of Immunological Societies. We did not observe a significant correlation between clinical or laboratory characteristics of patients and the presence or absence of PID variants.Applying WES to a population-based cohort of previously healthy children with bacterial sepsis detected Variants of Uncertain Significance in PID genes in one out of five children. Future studies need to investigate the functional relevance of these variants to determine whether variants in PID genes contribute to pediatric sepsis susceptibility.}", issn = {1058-4838}, doi = {10.1093/cid/ciaa290}, url = {https://doi.org/10.1093/cid/ciaa290}, note = {ciaa290}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa290/32924399/ciaa290.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-5.bibtex000066400000000000000000000045411470132070100265000ustar00rootroot00000000000000@article{10.1093/cid/ciaa282, author = {Segeral, Olivier and Dim, Bunnet and Durier, Christine and Prak, Sophearot and Chhim, Kearena and Vong, Chanlina and Pech, Sothy and Tiv, Say and Nem, Bunthoeun and Hout, Kay and Nouhin, Janin and Chhun, Samsorphea and Borand, Laurence}, title = "{HBeAg rapid test and alanine aminotransferase level-based algorithm to identify pregnant women at risk of HBV mother-to-child transmission: the ANRS 12345 TA PROHM study}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, abstract = "{The paucity of Hepatitis B (HBV) DNA measurement in low and middle-income countries hinders the identification of HBV-infected pregnant women at-risk of perinatal transmission. This study evaluates the validity of an algorithm selecting HBeAg-positive women and HBeAg-negative women with alanine aminotransferase (ALT) ≥40 IU/L as predictor of high HBV DNA level.All women with reactive sample for HBsAg were assessed with SD BIOLINE HBeAg rapid test and HBV DNA quantification was performed. 
Validities of HBeAg and of the algorithm to identify HBV DNA higher than two thresholds (5.3 and 7.3 Log10 IU/mL) were evaluated.For the 515 positive HBsAg women, median age was 29 years, 92 (17.9\\%) were HBeAg positive, 47 (9.1\\%) were HBeAg negative with ALT ≥40 IU/L and 144 (28.0\\%) had an HBV DNA \\>5.3 Log10 UI/mL. Sensitivity and specificity of HBeAg were 61.8\\% and 99.2\\% for HBV DNA \\>5.3 Log10 UI/mL and 81.3\\% and 96.7\\% for HBV DNA \\>7.3 Log10 UI/mL. For the algorithm, sensitivity and specificity were 79.2\\% and 93.3\\% for HBV DNA level \\>5.3 Log10 UI/mL and 92.7\\% and 88.1\\% for HBV DNA \\>7.3 Log10 UI/mL. The AUCs for the algorithm (0.92 and 0.94 for HBV DNA \\>5.3 and 7.3 respectively) were significantly greater (p\\<0.001) than the AUCs for HBeAg (0.81 and 0.89 for HBV DNA \\>5.3 and 7.3 respectively).An algorithm using HBeAg and ALT level could be an effective strategy to identify HBV-infected pregnant women at risk of perinatal transmission in countries where HBV DNA quantification is not routinely available.}", issn = {1058-4838}, doi = {10.1093/cid/ciaa282}, url = {https://doi.org/10.1093/cid/ciaa282}, note = {ciaa282}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa282/32928016/ciaa282.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-6.bibtex000066400000000000000000000043611470132070100265010ustar00rootroot00000000000000@article{10.1093/cid/ciaa295, author = {Mercier, Toine and Guldentops, Ellen and Lagrou, Katrien and Maertens, Johan}, title = "{Prospective evaluation of the turbidimetric β-D-glucan assay and two lateral flow assays on serum in invasive aspergillosis}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, abstract = "{Invasive aspergillosis (IA) remains a potentially lethal disease and requires timely diagnosis and initiation of antifungal therapy. Recently, the IMMY lateral flow assay (LFA), the OLM Diagnostics lateral flow device (LFD) and the Wako turbidimetric beta-D-glucan assay have been approved for use as a diagnostic aid. However, their performance in diagnosing IA on serum samples from at-risk patients and the added value to the existing detection of serum galactomannan remain to be investigated.We prospectively collected serum samples from 239 hematology patients and evaluated the diagnostic performance of these three assays while using the 2019 EORTC/MSG definitions.We identified five cases of proven IA, 36 cases of probable IA and 188 controls. The LFA had the highest negative predictive value (NPV) and sensitivity (0.90 and 0.49, respectively) while galactomannan detection had the highest positive predictive value (PPV) and specificity (0.93 and 0.99, respectively). Sensitivity was not significantly different between both tests. When used in combination, the highest NPV was seen in patients with a negative LFA and a negative beta-D-glucan test. The sensitivity of the LFD was significantly lower than the LFA. After omitting serum galactomannan from the definitions to control for incorporation bias, the sensitivity of the LFA outperformed galactomannan detection (0.41 versus 0.31, p=0.046).The LFA is a fast and effective alternative to serum galactomannan detection for the diagnosis of IA and is especially useful for centers with low sample throughputs. 
The addition of the Wako beta-D-glucan assay further improves the diagnostic performance.}", issn = {1058-4838}, doi = {10.1093/cid/ciaa295}, url = {https://doi.org/10.1093/cid/ciaa295}, note = {ciaa295}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa295/32928020/ciaa295.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/citations-7.bibtex000066400000000000000000000011131470132070100264720ustar00rootroot00000000000000@article{10.1093/cid/ciaa297, author = {Marr, Kieren A and Anjum, Seher and Hammoud, Dima A and Williamson, Peter R}, title = "{Neurologic Morbidity in People with Cryptococcosis in Absence of HIV Infection: Response to Letter to the Editor}", journal = {Clinical Infectious Diseases}, year = {2020}, month = {03}, issn = {1058-4838}, doi = {10.1093/cid/ciaa297}, url = {https://doi.org/10.1093/cid/ciaa297}, note = {ciaa297}, eprint = {https://academic.oup.com/cid/advance-article-pdf/doi/10.1093/cid/ciaa297/32928041/ciaa297.pdf}, } maggma-0.70.0/tests/test_files/simple_bib_example_data/text-1.txt000066400000000000000000000000071470132070100250160ustar00rootroot00000000000000text-1 maggma-0.70.0/tests/test_files/simple_bib_example_data/text-2.txt000066400000000000000000000000071470132070100250170ustar00rootroot00000000000000text-2 maggma-0.70.0/tests/test_files/simple_bib_example_data/text-4.txt000066400000000000000000000000071470132070100250210ustar00rootroot00000000000000text-4 maggma-0.70.0/tests/test_files/simple_bib_example_data/text-5.txt000066400000000000000000000000071470132070100250220ustar00rootroot00000000000000text-5 maggma-0.70.0/tests/test_files/simple_bib_example_data/text-6.txt000066400000000000000000000000071470132070100250230ustar00rootroot00000000000000text-6 maggma-0.70.0/tests/test_files/test_set/000077500000000000000000000000001470132070100201575ustar00rootroot00000000000000maggma-0.70.0/tests/test_files/test_set/a.json000066400000000000000000000003611470132070100212720ustar00rootroot00000000000000[{"A": 0, "task_id": 0}, {"A": 1, "task_id": 1}, {"A": 2, "task_id": 2}, {"A": 3, "task_id": 3}, {"A": 4, "task_id": 4}, {"A": 5, "task_id": 5}, {"A": 6, "task_id": 6}, {"A": 7, "task_id": 7}, {"A": 8, "task_id": 8}, {"A": 9, "task_id": 9}] maggma-0.70.0/tests/test_files/test_set/b.json000066400000000000000000000004051470132070100212720ustar00rootroot00000000000000[{"B": 10, "task_id": 10}, {"B": 11, "task_id": 11}, {"B": 12, "task_id": 12}, {"B": 13, "task_id": 13}, {"B": 14, "task_id": 14}, {"B": 15, "task_id": 15}, {"B": 16, "task_id": 16}, {"B": 17, "task_id": 17}, {"B": 18, "task_id": 18}, {"B": 19, "task_id": 19}] maggma-0.70.0/tests/test_files/test_set/c.json.gz000066400000000000000000000002351470132070100217130ustar00rootroot00000000000000,j[\c.json\л @WYpk.0)7o770