pax_global_header00006660000000000000000000000064150352717730014524gustar00rootroot0000000000000052 comment=a3e2ecd1f756a19cee15f85b96337a59c3b5337b cgroups-0.0.4/000077500000000000000000000000001503527177300132075ustar00rootroot00000000000000cgroups-0.0.4/.github/000077500000000000000000000000001503527177300145475ustar00rootroot00000000000000cgroups-0.0.4/.github/dependabot.yml000066400000000000000000000005201503527177300173740ustar00rootroot00000000000000# Please see the documentation for all configuration options: # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: # Dependencies listed in .github/workflows/*.yml - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" cgroups-0.0.4/.github/workflows/000077500000000000000000000000001503527177300166045ustar00rootroot00000000000000cgroups-0.0.4/.github/workflows/test.yml000066400000000000000000000030551503527177300203110ustar00rootroot00000000000000name: test on: push: tags: - v* branches: - main - release-* pull_request: schedule: # Runs at 00:00 UTC every Monday - cron: '0 0 * * 1' permissions: contents: read jobs: cgroup-v2: name: "cgroup v2 (Ubuntu 24.04)" timeout-minutes: 10 strategy: fail-fast: false matrix: go-version: [1.23.x, 1.24.x] race: ["-race", ""] runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: go-version: ${{ matrix.go-version }} - run: go test -timeout 3m ${{ matrix.race }} -v ./... cgroup-v1: name: "cgroup v1 (AlmaLinux 8)" timeout-minutes: 20 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: lima-vm/lima-actions/setup@v1 id: lima-actions-setup - uses: actions/cache@v4 with: path: ~/.cache/lima key: lima-${{ steps.lima-actions-setup.outputs.version }} - name: "Start VM" # --plain is set to disable file sharing, port forwarding, built-in containerd, etc. for faster start up run: limactl start --plain --name=default template://almalinux-8 - name: "Initialize VM" run: | set -eux -o pipefail limactl cp -r . default:/tmp/cgroups lima sudo dnf install -y golang - name: "Run unit tests" run: LIMA_WORKDIR=/tmp/cgroups lima sudo GOTOOLCHAIN=auto go test -v ./... all-done: needs: - cgroup-v2 - cgroup-v1 runs-on: ubuntu-24.04 steps: - run: echo "All jobs completed" cgroups-0.0.4/.github/workflows/validate.yml000066400000000000000000000042101503527177300211150ustar00rootroot00000000000000name: validate on: push: tags: - v* branches: - main - release-* pull_request: schedule: # Runs at 00:00 UTC every Monday - cron: '0 0 * * 1' env: GO_VERSION: 1.24 permissions: contents: read jobs: lint: timeout-minutes: 30 permissions: contents: read pull-requests: read checks: write # to allow the action to annotate code in the PR. runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 with: fetch-depth: 2 - uses: actions/setup-go@v5 with: go-version: "${{ env.GO_VERSION }}" - uses: golangci/golangci-lint-action@v8 with: version: v2.1 # Extra linters, only checking new code from a pull request. - name: lint-extra if: github.event_name == 'pull_request' run: | golangci-lint run --config .golangci-extra.yml --new-from-rev=HEAD~1 go-fix: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 with: fetch-depth: 2 - uses: actions/setup-go@v5 with: go-version: "${{ env.GO_VERSION }}" - name: run go fix run: | go fix ./... git diff --exit-code codespell: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - name: install deps # Use a known version of codespell. 
run: pip install --break-system-packages codespell==v2.4.1 - name: run codespell run: codespell space-at-eol: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - run: if git -P grep -I -n '\s$'; then echo "^^^ extra whitespace at EOL, please fix"; exit 1; fi deps: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: go-version: "${{ env.GO_VERSION }}" - run: go mod tidy --diff govulncheck: runs-on: ubuntu-24.04 steps: - uses: golang/govulncheck-action@v1 all-done: needs: - codespell - deps - go-fix - govulncheck - lint - space-at-eol runs-on: ubuntu-24.04 steps: - run: echo "All jobs completed" cgroups-0.0.4/.golangci-extra.yml000066400000000000000000000011331503527177300167120ustar00rootroot00000000000000# This is golangci-lint config file which is used to check NEW code in # github PRs only (see lint-extra in .github/workflows/validate.yml). # # For the default linter config, see .golangci.yml. This config should # only enable additional linters and/or linter settings not enabled # in the default config. version: "2" linters: default: none enable: - godot - revive - staticcheck settings: staticcheck: checks: - all - -QF1008 # https://staticcheck.dev/docs/checks/#QF1008 Omit embedded fields from selector expression. exclusions: generated: strict cgroups-0.0.4/.golangci.yml000066400000000000000000000015051503527177300155740ustar00rootroot00000000000000# For documentation, see https://golangci-lint.run/usage/configuration/ version: "2" formatters: enable: - gofumpt exclusions: generated: strict linters: enable: - errorlint - nolintlint - unconvert - unparam settings: govet: enable: - nilness staticcheck: checks: - all - -ST1000 # https://staticcheck.dev/docs/checks/#ST1000 Incorrect or missing package comment. - -ST1003 # https://staticcheck.dev/docs/checks/#ST1003 Poorly chosen identifier. - -ST1005 # https://staticcheck.dev/docs/checks/#ST1005 Incorrectly formatted error string. - -QF1008 # https://staticcheck.dev/docs/checks/#QF1008 Omit embedded fields from selector expression. exclusions: generated: strict presets: - comments - std-error-handling cgroups-0.0.4/CODEOWNERS000066400000000000000000000000511503527177300145760ustar00rootroot00000000000000* @maintainer1 @maintainer2 @maintainer3 cgroups-0.0.4/CONTRIBUTING.md000066400000000000000000000146001503527177300154410ustar00rootroot00000000000000# Contribution Guidelines Development happens on GitHub. Issues are used for bugs and actionable items and longer discussions can happen on the [mailing list](#mailing-list). The content of this repository is licensed under the [Apache License, Version 2.0](LICENSE). ## Code of Conduct Participation in the Open Container community is governed by [Open Container Code of Conduct][code-of-conduct]. ## Meetings The contributors and maintainers of all OCI projects have monthly meetings at 2:00 PM (USA Pacific) on the first Wednesday of every month. There is an [iCalendar][rfc5545] format for the meetings [here][meeting.ics]. Everyone is welcome to participate via [UberConference web][UberConference] or audio-only: +1 415 968 0849 (no PIN needed). An initial agenda will be posted to the [mailing list](#mailing-list) in the week before each meeting, and everyone is welcome to propose additional topics or suggest other agenda alterations there. Minutes from past meetings are archived [here][minutes]. ## Mailing list You can subscribe and browse the mailing list on [Google Groups][mailing-list]. 
## IRC OCI discussion happens on #opencontainers on [Freenode][] ([logs][irc-logs]). ## Git ### Security issues If you are reporting a security issue, do not create an issue or file a pull request on GitHub. Instead, disclose the issue responsibly by sending an email to security@opencontainers.org (which is inhabited only by the maintainers of the various OCI projects). ### Pull requests are always welcome We are always thrilled to receive pull requests, and do our best to process them as fast as possible. Not sure if that typo is worth a pull request? Do it! We will appreciate it. If your pull request is not accepted on the first try, don't be discouraged! If there's a problem with the implementation, hopefully you received feedback on what to improve. We're trying very hard to keep the project lean and focused. We don't want it to do everything for everybody. This means that we might decide against incorporating a new feature. ### Conventions Fork the repo and make changes on your fork in a feature branch. For larger bugs and enhancements, consider filing a leader issue or mailing-list thread for discussion that is independent of the implementation. Small changes or changes that have been discussed on the [project mailing list](#mailing-list) may be submitted without a leader issue. If the project has a test suite, submit unit tests for your changes. Take a look at existing tests for inspiration. Run the full test suite on your branch before submitting a pull request. Update the documentation when creating or modifying features. Test your documentation changes for clarity, concision, and correctness, as well as a clean documentation build. Pull requests descriptions should be as clear as possible and include a reference to all the issues that they address. Commit messages must start with a capitalized and short summary written in the imperative, followed by an optional, more detailed explanatory text which is separated from the summary by an empty line. Code review comments may be added to your pull request. Discuss, then make the suggested modifications and push additional commits to your feature branch. Be sure to post a comment after pushing. The new commits will show up in the pull request automatically, but the reviewers will not be notified unless you comment. Before the pull request is merged, make sure that you squash your commits into logical units of work using `git rebase -i` and `git push -f`. After every commit the test suite (if any) should be passing. Include documentation changes in the same commit so that a revert would remove all traces of the feature or fix. Commits that fix or close an issue should include a reference like `Closes #XXX` or `Fixes #XXX`, which will automatically close the issue when merged. ### Sign your work The sign-off is a simple line at the end of the explanation for the patch, which certifies that you wrote it or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify the below (from [developercertificate.org][]): ``` Developer Certificate of Origin Version 1.1 Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 1 Letterman Drive Suite D4700 San Francisco, CA, 94129 Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
Developer's Certificate of Origin 1.1 By making a contribution to this project, I certify that: (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. ``` then you just add a line to every git commit message: Signed-off-by: Joe Smith using your real name (sorry, no pseudonyms or anonymous contributions.) You can add the sign off when creating the git commit via `git commit -s`. [code-of-conduct]: https://github.com/opencontainers/tob/blob/d2f9d68c1332870e40693fe077d311e0742bc73d/code-of-conduct.md [developercertificate.org]: http://developercertificate.org/ [Freenode]: https://freenode.net/ [irc-logs]: http://ircbot.wl.linuxfoundation.org/eavesdrop/%23opencontainers/ [mailing-list]: https://groups.google.com/a/opencontainers.org/forum/#!forum/dev [meeting.ics]: https://github.com/opencontainers/runtime-spec/blob/master/meeting.ics [minutes]: http://ircbot.wl.linuxfoundation.org/meetings/opencontainers/ [rfc5545]: https://tools.ietf.org/html/rfc5545 [UberConference]: https://www.uberconference.com/opencontainers cgroups-0.0.4/GOVERNANCE.md000066400000000000000000000063041503527177300151630ustar00rootroot00000000000000# Project governance The [OCI charter][charter] §5.b.viii tasks an OCI Project's maintainers (listed in the repository's MAINTAINERS file and sometimes referred to as "the TDC", [§5.e][charter]) with: > Creating, maintaining and enforcing governance guidelines for the TDC, approved by the maintainers, and which shall be posted visibly for the TDC. This section describes generic rules and procedures for fulfilling that mandate. ## Proposing a motion A maintainer SHOULD propose a motion on the dev@opencontainers.org mailing list (except [security issues](#security-issues)) with another maintainer as a co-sponsor. ## Voting Voting on a proposed motion SHOULD happen on the dev@opencontainers.org mailing list (except [security issues](#security-issues)) with maintainers posting LGTM or REJECT. Maintainers MAY also explicitly not vote by posting ABSTAIN (which is useful to revert a previous vote). Maintainers MAY post multiple times (e.g. as they revise their position based on feedback), but only their final post counts in the tally. A proposed motion is adopted if two-thirds of votes cast, a quorum having voted, are in favor of the release. Voting SHOULD remain open for a week to collect feedback from the wider community and allow the maintainers to digest the proposed motion. Under exceptional conditions (e.g. non-major security fix releases) proposals which reach quorum with unanimous support MAY be adopted earlier. A maintainer MAY choose to reply with REJECT. 
A maintainer posting a REJECT MUST include a list of concerns or links to written documentation for those concerns (e.g. GitHub issues or mailing-list threads). The maintainers SHOULD try to resolve the concerns and wait for the rejecting maintainer to change their opinion to LGTM. However, a motion MAY be adopted with REJECTs, as outlined in the previous paragraphs. ## Quorum A quorum is established when at least two-thirds of maintainers have voted. For projects that are not specifications, a [motion to release](#release-approval) MAY be adopted if the tally is at least three LGTMs and no REJECTs, even if three votes does not meet the usual two-thirds quorum. ## Amendments The [project governance](#project-governance) rules and procedures MAY be amended or replaced using the procedures themselves. The MAINTAINERS of this project governance document is the total set of MAINTAINERS from all Open Containers projects (go-digest, image-spec, image-tools, runC, runtime-spec, runtime-tools, and selinux). ## Subject templates Maintainers are busy and get lots of email. To make project proposals recognizable, proposed motions SHOULD use the following subject templates. ### Proposing a motion > [{project} VOTE]: {motion description} (closes {end of voting window}) For example: > [runtime-spec VOTE]: Tag 0647920 as 1.0.0-rc (closes 2016-06-03 20:00 UTC) ### Tallying results After voting closes, a maintainer SHOULD post a tally to the motion thread with a subject template like: > [{project} {status}]: {motion description} (+{LGTMs} -{REJECTs} #{ABSTAINs}) Where `{status}` is either `adopted` or `rejected`. For example: > [runtime-spec adopted]: Tag 0647920 as 1.0.0-rc (+6 -0 #3) [charter]: https://www.opencontainers.org/about/governance cgroups-0.0.4/LICENSE000066400000000000000000000261351503527177300142230ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. cgroups-0.0.4/MAINTAINERS000066400000000000000000000005721503527177300147100ustar00rootroot00000000000000Akihiro Suda (@AkihiroSuda) Aleksa Sarai (@cyphar) Kir Kolyshkin (@kolyshkin) Mrunal Patel (@mrunalp) Sebastiaan van Stijn (@thaJeztah) Odin Ugedal (@odinuge) Peter Hunt (@haircommander) Davanum Srinivas (@dims) cgroups-0.0.4/MAINTAINERS_GUIDE.md000066400000000000000000000113231503527177300162200ustar00rootroot00000000000000## Introduction Dear maintainer. Thank you for investing the time and energy to help make this project as useful as possible. Maintaining a project is difficult, sometimes unrewarding work. Sure, you will get to contribute cool features to the project. But most of your time will be spent reviewing, cleaning up, documenting, answering questions, justifying design decisions - while everyone has all the fun! But remember - the quality of the maintainers work is what distinguishes the good projects from the great. 
So please be proud of your work, even the unglamourous parts, and encourage a culture of appreciation and respect for *every* aspect of improving the project - not just the hot new features. This document is a manual for maintainers old and new. It explains what is expected of maintainers, how they should work, and what tools are available to them. This is a living document - if you see something out of date or missing, speak up! ## What are a maintainer's responsibilities? It is every maintainer's responsibility to: * Expose a clear roadmap for improving their component. * Deliver prompt feedback and decisions on pull requests. * Be available to anyone with questions, bug reports, criticism etc. on their component. This includes IRC and GitHub issues and pull requests. * Make sure their component respects the philosophy, design and roadmap of the project. ## How are decisions made? This project is an open-source project with an open design philosophy. This means that the repository is the source of truth for EVERY aspect of the project, including its philosophy, design, roadmap and APIs. *If it's part of the project, it's in the repo. It's in the repo, it's part of the project.* As a result, all decisions can be expressed as changes to the repository. An implementation change is a change to the source code. An API change is a change to the API specification. A philosophy change is a change to the philosophy manifesto. And so on. All decisions affecting this project, big and small, follow the same procedure: 1. Discuss a proposal on the [mailing list](CONTRIBUTING.md#mailing-list). Anyone can do this. 2. Open a pull request. Anyone can do this. 3. Discuss the pull request. Anyone can do this. 4. Endorse (`LGTM`) or oppose (`Rejected`) the pull request. The relevant maintainers do this (see below [Who decides what?](#who-decides-what)). Changes that affect project management (changing policy, cutting releases, etc.) are [proposed and voted on the mailing list](GOVERNANCE.md). 5. Merge or close the pull request. The relevant maintainers do this. ### I'm a maintainer, should I make pull requests too? Yes. Nobody should ever push to master directly. All changes should be made through a pull request. ## Who decides what? All decisions are pull requests, and the relevant maintainers make decisions by accepting or refusing the pull request. Review and acceptance by anyone is denoted by adding a comment in the pull request: `LGTM`. However, only currently listed `MAINTAINERS` are counted towards the required two LGTMs. In addition, if a maintainer has created a pull request, they cannot count toward the two LGTM rule (to ensure equal amounts of review for every pull request, no matter who wrote it). Overall the maintainer system works because of mutual respect. The maintainers trust one another to act in the best interests of the project. Sometimes maintainers can disagree and this is part of a healthy project to represent the points of view of various people. In the case where maintainers cannot find agreement on a specific change, maintainers should use the [governance procedure](GOVERNANCE.md) to attempt to reach a consensus. ### How are maintainers added? The best maintainers have a vested interest in the project. Maintainers are first and foremost contributors that have shown they are committed to the long term success of the project. 
Contributors wanting to become maintainers are expected to be deeply involved in contributing code, pull request review, and triage of issues in the project for more than two months. Just contributing does not make you a maintainer, it is about building trust with the current maintainers of the project and being a person that they can depend on to act in the best interest of the project. The final vote to add a new maintainer should be approved by the [governance procedure](GOVERNANCE.md). ### How are maintainers removed? When a maintainer is unable to perform the [required duties](#what-are-a-maintainers-responsibilities) they can be removed by the [governance procedure](GOVERNANCE.md). Issues related to a maintainer's performance should be discussed with them among the other maintainers so that they are not surprised by a pull request removing them. cgroups-0.0.4/README.md000066400000000000000000000005601503527177300144670ustar00rootroot00000000000000# OCI Project Template Useful boilerplate and organizational information for all OCI projects. * README (this file) * [The Apache License, Version 2.0](LICENSE) * [A list of maintainers](MAINTAINERS) * [Maintainer guidelines](MAINTAINERS_GUIDE.md) * [Contributor guidelines](CONTRIBUTING.md) * [Project governance](GOVERNANCE.md) * [Release procedures](RELEASES.md) cgroups-0.0.4/RELEASES.md000066400000000000000000000066761503527177300147530ustar00rootroot00000000000000# Releases The release process hopes to encourage early, consistent consensus-building during project development. The mechanisms used are regular community communication on the mailing list about progress, scheduled meetings for issue resolution and release triage, and regularly paced and communicated releases. Releases are proposed and adopted or rejected using the usual [project governance](GOVERNANCE.md) rules and procedures. An anti-pattern that we want to avoid is heavy development or discussions "late cycle" around major releases. We want to build a community that is involved and communicates consistently through all releases instead of relying on "silent periods" as a judge of stability. ## Parallel releases A single project MAY consider several motions to release in parallel. However each motion to release after the initial 0.1.0 MUST be based on a previous release that has already landed. For example, runtime-spec maintainers may propose a v1.0.0-rc2 on the 1st of the month and a v0.9.1 bugfix on the 2nd of the month. They may not propose a v1.0.0-rc3 until the v1.0.0-rc2 is accepted (on the 7th if the vote initiated on the 1st passes). ## Specifications The OCI maintains three categories of projects: specifications, applications, and conformance-testing tools. However, specification releases have special restrictions in the [OCI charter][charter]: * They are the target of backwards compatibility (§7.g), and * They are subject to the OFWa patent grant (§8.d and e). To avoid unfortunate side effects (onerous backwards compatibility requirements or Member resignations), the following additional procedures apply to specification releases: ### Planning a release Every OCI specification project SHOULD hold meetings that involve maintainers reviewing pull requests, debating outstanding issues, and planning releases. This meeting MUST be advertised on the project README and MAY happen on a phone call, video conference, or on IRC. Maintainers MUST send updates to the dev@opencontainers.org with results of these meetings. 
Before the specification reaches v1.0.0, the meetings SHOULD be weekly. Once a specification has reached v1.0.0, the maintainers may alter the cadence, but a meeting MUST be held within four weeks of the previous meeting. The release plans, corresponding milestones and estimated due dates MUST be published on GitHub (e.g. https://github.com/opencontainers/runtime-spec/milestones). GitHub milestones and issues are only used for community organization and all releases MUST follow the [project governance](GOVERNANCE.md) rules and procedures. ### Timelines Specifications have a variety of different timelines in their lifecycle. * Pre-v1.0.0 specifications SHOULD release on a monthly cadence to garner feedback. * Major specification releases MUST release at least three release candidates spaced a minimum of one week apart. This means a major release like a v1.0.0 or v2.0.0 release will take 1 month at minimum: one week for rc1, one week for rc2, one week for rc3, and one week for the major release itself. Maintainers SHOULD strive to make zero breaking changes during this cycle of release candidates and SHOULD restart the three-candidate count when a breaking change is introduced. For example if a breaking change is introduced in v1.0.0-rc2 then the series would end with v1.0.0-rc4 and v1.0.0. * Minor and patch releases SHOULD be made on an as-needed basis. [charter]: https://www.opencontainers.org/about/governance cgroups-0.0.4/cgroups.go000066400000000000000000000050011503527177300152140ustar00rootroot00000000000000package cgroups import ( "errors" ) var ( // ErrDevicesUnsupported is an error returned when a cgroup manager // is not configured to set device rules. ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") // ErrRootless is returned by [Manager.Apply] when there is an error // creating cgroup directory, and cgroup.Rootless is set. In general, // this error is to be ignored. ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)") // DevicesSetV1 and DevicesSetV2 are functions to set devices for // cgroup v1 and v2, respectively. Unless // [github.com/opencontainers/cgroups/devices] // package is imported, it is set to nil, so cgroup managers can't // manage devices. DevicesSetV1 func(path string, r *Resources) error DevicesSetV2 func(path string, r *Resources) error ) type Manager interface { // Apply creates a cgroup, if not yet created, and adds a process // with the specified pid into that cgroup. A special value of -1 // can be used to merely create a cgroup. Apply(pid int) error // GetPids returns the PIDs of all processes inside the cgroup. GetPids() ([]int, error) // GetAllPids returns the PIDs of all processes inside the cgroup // any all its sub-cgroups. GetAllPids() ([]int, error) // GetStats returns cgroups statistics. GetStats() (*Stats, error) // Freeze sets the freezer cgroup to the specified state. Freeze(state FreezerState) error // Destroy removes cgroup. Destroy() error // Path returns a cgroup path to the specified controller/subsystem. // For cgroupv2, the argument is unused and can be empty. Path(string) string // Set sets cgroup resources parameters/limits. If the argument is nil, // the resources specified during Manager creation (or the previous call // to Set) are used. Set(r *Resources) error // GetPaths returns cgroup path(s) to save in a state file in order to // restore later. 
// // For cgroup v1, a key is cgroup subsystem name, and the value is the // path to the cgroup for this subsystem. // // For cgroup v2 unified hierarchy, a key is "", and the value is the // unified path. GetPaths() map[string]string // GetCgroups returns the cgroup data as configured. GetCgroups() (*Cgroup, error) // GetFreezerState retrieves the current FreezerState of the cgroup. GetFreezerState() (FreezerState, error) // Exists returns whether the cgroup path exists or not. Exists() bool // OOMKillCount reports OOM kill count for the cgroup. OOMKillCount() (uint64, error) } cgroups-0.0.4/cgroups_test.go000066400000000000000000000006601503527177300162610ustar00rootroot00000000000000package cgroups import ( "testing" ) func TestParseCgroups(t *testing.T) { // We don't need to use /proc/thread-self here because runc always runs // with every thread in the same cgroup. This lets us avoid having to do // runtime.LockOSThread. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { t.Fatal(err) } if IsCgroup2UnifiedMode() { return } if _, ok := cgroups["cpu"]; !ok { t.Fail() } } cgroups-0.0.4/config_blkio_device.go000066400000000000000000000041541503527177300175060ustar00rootroot00000000000000package cgroups import "fmt" // BlockIODevice holds major:minor format supported in blkio cgroup. type BlockIODevice struct { // Major is the device's major number Major int64 `json:"major"` // Minor is the device's minor number Minor int64 `json:"minor"` } // WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair type WeightDevice struct { BlockIODevice // Weight is the bandwidth rate for the device, range is from 10 to 1000 Weight uint16 `json:"weight"` // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only LeafWeight uint16 `json:"leafWeight"` } // NewWeightDevice returns a configured WeightDevice pointer func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { wd := &WeightDevice{} wd.Major = major wd.Minor = minor wd.Weight = weight wd.LeafWeight = leafWeight return wd } // WeightString formats the struct to be writable to the cgroup specific file func (wd *WeightDevice) WeightString() string { return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) } // LeafWeightString formats the struct to be writable to the cgroup specific file func (wd *WeightDevice) LeafWeightString() string { return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) } // ThrottleDevice struct holds a `major:minor rate_per_second` pair type ThrottleDevice struct { BlockIODevice // Rate is the IO rate limit per cgroup per device Rate uint64 `json:"rate"` } // NewThrottleDevice returns a configured ThrottleDevice pointer func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { td := &ThrottleDevice{} td.Major = major td.Minor = minor td.Rate = rate return td } // String formats the struct to be writable to the cgroup specific file func (td *ThrottleDevice) String() string { return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) } // StringName formats the struct to be writable to the cgroup specific file func (td *ThrottleDevice) StringName(name string) string { return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) } cgroups-0.0.4/config_hugepages.go000066400000000000000000000002641503527177300170350ustar00rootroot00000000000000package cgroups type HugepageLimit struct { // which type of hugepage to limit. 
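// The size is given as a value with a unit suffix, e.g. "2MB" or "1GB", matching the hugetlb controller file names (hugetlb.<size>.limit_in_bytes).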
Pagesize string `json:"page_size"` // usage limit for hugepage. Limit uint64 `json:"limit"` } cgroups-0.0.4/config_ifprio_map.go000066400000000000000000000003541503527177300172120ustar00rootroot00000000000000package cgroups import ( "fmt" ) type IfPrioMap struct { Interface string `json:"interface"` Priority int64 `json:"priority"` } func (i *IfPrioMap) CgroupString() string { return fmt.Sprintf("%s %d", i.Interface, i.Priority) } cgroups-0.0.4/config_linux.go000066400000000000000000000154371503527177300162340ustar00rootroot00000000000000package cgroups import ( systemdDbus "github.com/coreos/go-systemd/v22/dbus" devices "github.com/opencontainers/cgroups/devices/config" ) type FreezerState string const ( Undefined FreezerState = "" Frozen FreezerState = "FROZEN" Thawed FreezerState = "THAWED" ) // Cgroup holds properties of a cgroup on Linux. type Cgroup struct { // Name specifies the name of the cgroup Name string `json:"name,omitempty"` // Parent specifies the name of parent of cgroup or slice Parent string `json:"parent,omitempty"` // Path specifies the path to cgroups that are created and/or joined by the container. // The path is assumed to be relative to the host system cgroup mountpoint. Path string `json:"path,omitempty"` // ScopePrefix describes prefix for the scope name. ScopePrefix string `json:"scope_prefix,omitempty"` // Resources contains various cgroups settings to apply. *Resources // Systemd tells if systemd should be used to manage cgroups. Systemd bool `json:"Systemd,omitempty"` // SystemdProps are any additional properties for systemd, // derived from org.systemd.property.xxx annotations. // Ignored unless systemd is used for managing cgroups. SystemdProps []systemdDbus.Property `json:"-"` // Rootless tells if rootless cgroups should be used. Rootless bool `json:"Rootless,omitempty"` // The host UID that should own the cgroup, or nil to accept // the default ownership. This should only be set when the // cgroupfs is to be mounted read/write. // Not all cgroup manager implementations support changing // the ownership. OwnerUID *int `json:"owner_uid,omitempty"` } type Resources struct { // Devices is the set of access rules for devices in the container. Devices []*devices.Rule `json:"devices,omitempty"` // Memory limit (in bytes). Memory int64 `json:"memory,omitempty"` // Memory reservation or soft_limit (in bytes). MemoryReservation int64 `json:"memory_reservation,omitempty"` // Total memory usage (memory+swap); use -1 for unlimited swap. MemorySwap int64 `json:"memory_swap,omitempty"` // CPU shares (relative weight vs. other containers). CpuShares uint64 `json:"cpu_shares,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuShares should be CPUShares". // CPU hardcap limit (in usecs). Allowed cpu time in a given period. CpuQuota int64 `json:"cpu_quota,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuQuota should be CPUQuota". // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period. CpuBurst *uint64 `json:"cpu_burst,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuBurst should be CPUBurst". // CPU period to be used for hardcapping (in usecs). 0 to use system default. CpuPeriod uint64 `json:"cpu_period,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuPeriod should be CPUPeriod". // How many time CPU will use in realtime scheduling (in usecs). 
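// Maps to cpu.rt_runtime_us, which is only available on cgroup v1 with a CONFIG_RT_GROUP_SCHED kernel.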
CpuRtRuntime int64 `json:"cpu_rt_quota,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuRtRuntime should be CPURtRuntime". // CPU period to be used for realtime scheduling (in usecs). CpuRtPeriod uint64 `json:"cpu_rt_period,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuQuota should be CPUQuota". // Cpuset CPUs to use. CpusetCpus string `json:"cpuset_cpus,omitempty"` // Cpuset memory nodes to use. CpusetMems string `json:"cpuset_mems,omitempty"` // Cgroup's SCHED_IDLE value. CPUIdle *int64 `json:"cpu_idle,omitempty"` // Process limit; set <= `0' to disable limit. PidsLimit int64 `json:"pids_limit,omitempty"` // Specifies per cgroup weight, range is from 10 to 1000. BlkioWeight uint16 `json:"blkio_weight,omitempty"` // Tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only. BlkioLeafWeight uint16 `json:"blkio_leaf_weight,omitempty"` // Weight per cgroup per device, can override BlkioWeight. BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device,omitempty"` // IO read rate limit per cgroup per device, bytes per second. BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device,omitempty"` // IO write rate limit per cgroup per device, bytes per second. BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device,omitempty"` // IO read rate limit per cgroup per device, IO per second. BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device,omitempty"` // IO write rate limit per cgroup per device, IO per second. BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device,omitempty"` // Freeze value for the process. Freezer FreezerState `json:"freezer,omitempty"` // Hugetlb limit (in bytes). HugetlbLimit []*HugepageLimit `json:"hugetlb_limit,omitempty"` // Whether to disable OOM killer. OomKillDisable bool `json:"oom_kill_disable,omitempty"` // Tuning swappiness behaviour per cgroup. MemorySwappiness *uint64 `json:"memory_swappiness,omitempty"` // Set priority of network traffic for container. NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap,omitempty"` // Set class identifier for container's network packets. NetClsClassid uint32 `json:"net_cls_classid_u,omitempty"` // Rdma resource restriction configuration. Rdma map[string]LinuxRdma `json:"rdma,omitempty"` // Used on cgroups v2: // CpuWeight sets a proportional bandwidth limit. CpuWeight uint64 `json:"cpu_weight,omitempty"` //nolint:revive // Suppress "var-naming: struct field CpuWeight should be CPUWeight". // Unified is cgroupv2-only key-value map. Unified map[string]string `json:"unified,omitempty"` // SkipDevices allows to skip configuring device permissions. // Used by e.g. kubelet while creating a parent cgroup (kubepods) // common for many containers, and by runc update. // // NOTE it is impossible to start a container which has this flag set. SkipDevices bool `json:"-"` // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup // freeze when setting resources. Only applicable to systemd legacy // (i.e. cgroup v1) manager (which uses freeze by default to avoid // spurious permission errors caused by systemd inability to update // device rules in a non-disruptive manner). // // If not set, a few methods (such as looking into cgroup's // devices.list and querying the systemd unit properties) are used // during Set() to figure out whether the freeze is required. 
Those // methods may be relatively slow, thus this flag. SkipFreezeOnSet bool `json:"-"` // MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check // if the new memory limits (Memory and MemorySwap) being set are lower // than the current memory usage, and reject if so. MemoryCheckBeforeUpdate bool `json:"memory_check_before_update,omitempty"` } cgroups-0.0.4/config_rdma.go000066400000000000000000000005621503527177300160110ustar00rootroot00000000000000package cgroups // LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) type LinuxRdma struct { // Maximum number of HCA handles that can be opened. Default is "no limit". HcaHandles *uint32 `json:"hca_handles,omitempty"` // Maximum number of HCA objects that can be created. Default is "no limit". HcaObjects *uint32 `json:"hca_objects,omitempty"` } cgroups-0.0.4/config_unsupported.go000066400000000000000000000003371503527177300174560ustar00rootroot00000000000000//go:build !linux package cgroups // Cgroup holds properties of a cgroup on Linux // TODO Windows: This can ultimately be entirely factored out on Windows as // cgroups are a Unix-specific construct. type Cgroup struct{} cgroups-0.0.4/devices/000077500000000000000000000000001503527177300146315ustar00rootroot00000000000000cgroups-0.0.4/devices/config/000077500000000000000000000000001503527177300160765ustar00rootroot00000000000000cgroups-0.0.4/devices/config/device.go000066400000000000000000000071651503527177300176750ustar00rootroot00000000000000package config import ( "fmt" "os" "strconv" ) const ( Wildcard = -1 ) type Device struct { Rule // Path to the device. Path string `json:"path"` // FileMode permission bits for the device. FileMode os.FileMode `json:"file_mode"` // Uid of the device. Uid uint32 `json:"uid,omitempty"` //nolint:revive // Suppress "var-naming: struct field Uid should be UID". // Gid of the device. Gid uint32 `json:"gid,omitempty"` //nolint:revive // Suppress "var-naming: struct field Gid should be GID". } // Permissions is a cgroupv1-style string to represent device access. It // has to be a string for backward compatibility reasons, hence why it has // methods to do set operations. type Permissions string const ( deviceRead uint = (1 << iota) deviceWrite deviceMknod ) func (p Permissions) toSet() uint { var set uint for _, perm := range p { switch perm { case 'r': set |= deviceRead case 'w': set |= deviceWrite case 'm': set |= deviceMknod } } return set } func fromSet(set uint) Permissions { var perm string if set&deviceRead == deviceRead { perm += "r" } if set&deviceWrite == deviceWrite { perm += "w" } if set&deviceMknod == deviceMknod { perm += "m" } return Permissions(perm) } // Union returns the union of the two sets of Permissions. func (p Permissions) Union(o Permissions) Permissions { lhs := p.toSet() rhs := o.toSet() return fromSet(lhs | rhs) } // Difference returns the set difference of the two sets of Permissions. // In set notation, A.Difference(B) gives you A\B. func (p Permissions) Difference(o Permissions) Permissions { lhs := p.toSet() rhs := o.toSet() return fromSet(lhs &^ rhs) } // Intersection computes the intersection of the two sets of Permissions. func (p Permissions) Intersection(o Permissions) Permissions { lhs := p.toSet() rhs := o.toSet() return fromSet(lhs & rhs) } // IsEmpty returns whether the set of permissions in a Permissions is // empty. func (p Permissions) IsEmpty() bool { return p == Permissions("") } // IsValid returns whether the set of permissions is a subset of valid // permissions (namely, {r,w,m}). 
func (p Permissions) IsValid() bool { return p == fromSet(p.toSet()) } type Type rune const ( WildcardDevice Type = 'a' BlockDevice Type = 'b' CharDevice Type = 'c' // or 'u' FifoDevice Type = 'p' ) func (t Type) IsValid() bool { switch t { case WildcardDevice, BlockDevice, CharDevice, FifoDevice: return true default: return false } } func (t Type) CanMknod() bool { switch t { case BlockDevice, CharDevice, FifoDevice: return true default: return false } } func (t Type) CanCgroup() bool { switch t { case WildcardDevice, BlockDevice, CharDevice: return true default: return false } } type Rule struct { // Type of device ('c' for char, 'b' for block). If set to 'a', this rule // acts as a wildcard and all fields other than Allow are ignored. Type Type `json:"type"` // Major is the device's major number. Major int64 `json:"major"` // Minor is the device's minor number. Minor int64 `json:"minor"` // Permissions is the set of permissions that this rule applies to (in the // cgroupv1 format -- any combination of "rwm"). Permissions Permissions `json:"permissions"` // Allow specifies whether this rule is allowed. Allow bool `json:"allow"` } func (d *Rule) CgroupString() string { var ( major = strconv.FormatInt(d.Major, 10) minor = strconv.FormatInt(d.Minor, 10) ) if d.Major == Wildcard { major = "*" } if d.Minor == Wildcard { minor = "*" } return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) } func (d *Rule) Mkdev() (uint64, error) { return mkDev(d) } cgroups-0.0.4/devices/config/mknod_unix.go000066400000000000000000000004251503527177300206010ustar00rootroot00000000000000package config import ( "errors" "golang.org/x/sys/unix" ) func mkDev(d *Rule) (uint64, error) { if d.Major == Wildcard || d.Minor == Wildcard { return 0, errors.New("cannot mkdev() device with wildcards") } return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil } cgroups-0.0.4/devices/devicefilter.go000066400000000000000000000137311503527177300176320ustar00rootroot00000000000000// Implements creation of eBPF device filter program. // // Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c // // Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) // agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 package devices import ( "errors" "fmt" "math" "strconv" "github.com/cilium/ebpf/asm" devices "github.com/opencontainers/cgroups/devices/config" "golang.org/x/sys/unix" ) const ( // license string format is same as kernel MODULE_LICENSE macro license = "Apache" ) // deviceFilter returns eBPF device filter program and its license string. func deviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { // Generate the minimum ruleset for the device rules we are given. While we // don't care about minimum transitions in cgroupv2, using the emulator // gives us a guarantee that the behaviour of devices filtering is the same // as cgroupv1, including security hardenings to avoid misconfiguration // (such as punching holes in wildcard rules). 
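// The flow below: replay every rule through the emulator, fetch the minimized ruleset back, then translate each remaining rule into an eBPF block via appendRule, ending with a block that returns the default verdict (see finalize).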
emu := new(emulator) for _, rule := range rules { if err := emu.Apply(*rule); err != nil { return nil, "", err } } cleanRules, err := emu.Rules() if err != nil { return nil, "", err } p := &program{ defaultAllow: emu.IsBlacklist(), } p.init() for idx, rule := range cleanRules { if rule.Type == devices.WildcardDevice { // We can safely skip over wildcard entries because there should // only be one (at most) at the very start to instruct cgroupv1 to // go into allow-list mode. However we do double-check this here. if idx != 0 || rule.Allow != emu.IsBlacklist() { return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString()) } continue } if rule.Allow == p.defaultAllow { // There should be no rules which have an action equal to the // default action, the emulator removes those. return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString()) } if err := p.appendRule(rule); err != nil { return nil, "", err } } return p.finalize(), license, nil } type program struct { insts asm.Instructions defaultAllow bool blockID int } func (p *program) init() { // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 /* u32 access_type u32 major u32 minor */ // R2 <- type (lower 16 bit of u32 access_type at R1[0]) p.insts = append(p.insts, asm.LoadMem(asm.R2, asm.R1, 0, asm.Word), asm.And.Imm32(asm.R2, 0xFFFF)) // R3 <- access (upper 16 bit of u32 access_type at R1[0]) p.insts = append(p.insts, asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), // RSh: bitwise shift right asm.RSh.Imm32(asm.R3, 16)) // R4 <- major (u32 major at R1[4]) p.insts = append(p.insts, asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) // R5 <- minor (u32 minor at R1[8]) p.insts = append(p.insts, asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) } // appendRule converts an OCI rule to the relevant eBPF block and adds it // to the in-progress filter program. In order to operate properly, it must be // called with a "clean" rule list (generated by devices.Emulator.Rules() -- // with any "a" rules removed). func (p *program) appendRule(rule *devices.Rule) error { if p.blockID < 0 { return errors.New("the program is finalized") } var bpfType int32 switch rule.Type { case devices.CharDevice: bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) case devices.BlockDevice: bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) default: // We do not permit 'a', nor any other types we don't know about. return fmt.Errorf("invalid type %q", string(rule.Type)) } if rule.Major > math.MaxUint32 { return fmt.Errorf("invalid major %d", rule.Major) } if rule.Minor > math.MaxUint32 { return fmt.Errorf("invalid minor %d", rule.Minor) } hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1 hasMinor := rule.Minor >= 0 bpfAccess := int32(0) for _, r := range rule.Permissions { switch r { case 'r': bpfAccess |= unix.BPF_DEVCG_ACC_READ case 'w': bpfAccess |= unix.BPF_DEVCG_ACC_WRITE case 'm': bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD default: return fmt.Errorf("unknown device access %v", r) } } // If the access is rwm, skip the check.
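// (A rule granting all of r, w and m matches any requested access mask, so the R3 subset test would always pass and can be omitted.)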
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) var ( blockSym = "block-" + strconv.Itoa(p.blockID) nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) prevBlockLastIdx = len(p.insts) - 1 ) p.insts = append(p.insts, // if (R2 != bpfType) goto next asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), ) if hasAccess { p.insts = append(p.insts, // if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next asm.Mov.Reg32(asm.R1, asm.R3), asm.And.Imm32(asm.R1, bpfAccess), asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym), ) } if hasMajor { p.insts = append(p.insts, // if (R4 != major) goto next asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym), ) } if hasMinor { p.insts = append(p.insts, // if (R5 != minor) goto next asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym), ) } p.insts = append(p.insts, acceptBlock(rule.Allow)...) // set blockSym to the first instruction we added in this iteration p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym) p.blockID++ return nil } func (p *program) finalize() asm.Instructions { var v int32 if p.defaultAllow { v = 1 } blockSym := "block-" + strconv.Itoa(p.blockID) p.insts = append(p.insts, // R0 <- v asm.Mov.Imm32(asm.R0, v).WithSymbol(blockSym), asm.Return(), ) p.blockID = -1 return p.insts } func acceptBlock(accept bool) asm.Instructions { var v int32 if accept { v = 1 } return []asm.Instruction{ // R0 <- v asm.Mov.Imm32(asm.R0, v), asm.Return(), } } cgroups-0.0.4/devices/devicefilter_test.go000066400000000000000000000212721503527177300206700ustar00rootroot00000000000000package devices import ( "strings" "testing" devices "github.com/opencontainers/cgroups/devices/config" ) func hash(s, comm string) string { var res []string for _, l := range strings.Split(s, "\n") { trimmed := strings.TrimSpace(l) if trimmed == "" || strings.HasPrefix(trimmed, comm) { continue } res = append(res, trimmed) } return strings.Join(res, "\n") } func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) { insts, _, err := deviceFilter(devices) if err != nil { t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) } s := insts.String() if expectedStr != "" { hashed := hash(s, "//") expectedHashed := hash(expectedStr, "//") if expectedHashed != hashed { t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed) } } } func TestDeviceFilter_Nil(t *testing.T) { expected := ` // load parameters into registers 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 1: AndImm32 dst: r2 imm: 65535 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 3: RShImm32 dst: r3 imm: 16 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 0 (reject) 6: MovImm32 dst: r0 imm: 0 7: Exit ` testDeviceFilter(t, nil, expected) } func TestDeviceFilter_BuiltInAllowList(t *testing.T) { // This is a copy of all rules from // github.com/opencontainers/runc/libcontainer/specconv.AllowedDevices. 
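	// For reference, the concrete (major, minor) pairs below correspond to
	// the usual default device nodes: 1:3 /dev/null, 1:5 /dev/zero,
	// 1:7 /dev/full, 1:8 /dev/random, 1:9 /dev/urandom, 5:0 /dev/tty,
	// 5:2 /dev/ptmx, 136:* /dev/pts/*, and 10:200 /dev/net/tun.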
devices := []*devices.Rule{ { Type: devices.CharDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: "m", Allow: true, }, { Type: devices.BlockDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: "m", Allow: true, }, { Type: devices.CharDevice, Major: 1, Minor: 3, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 1, Minor: 8, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 1, Minor: 7, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 5, Minor: 0, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 1, Minor: 5, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 1, Minor: 9, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 136, Minor: devices.Wildcard, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 5, Minor: 2, Permissions: "rwm", Allow: true, }, { Type: devices.CharDevice, Major: 10, Minor: 200, Permissions: "rwm", Allow: true, }, } expected := ` // load parameters into registers 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 1: AndImm32 dst: r2 imm: 65535 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 3: RShImm32 dst: r3 imm: 16 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // (b, wildcard, wildcard, m, true) 6: JNEImm dst: r2 off: -1 imm: 1 7: MovReg32 dst: r1 src: r3 8: AndImm32 dst: r1 imm: 1 9: JNEReg dst: r1 off: -1 src: r3 10: MovImm32 dst: r0 imm: 1 11: Exit block-1: // (c, wildcard, wildcard, m, true) 12: JNEImm dst: r2 off: -1 imm: 2 13: MovReg32 dst: r1 src: r3 14: AndImm32 dst: r1 imm: 1 15: JNEReg dst: r1 off: -1 src: r3 16: MovImm32 dst: r0 imm: 1 17: Exit block-2: 18: JNEImm dst: r2 off: -1 imm: 2 19: JNEImm dst: r4 off: -1 imm: 1 20: JNEImm dst: r5 off: -1 imm: 3 21: MovImm32 dst: r0 imm: 1 22: Exit block-3: 23: JNEImm dst: r2 off: -1 imm: 2 24: JNEImm dst: r4 off: -1 imm: 1 25: JNEImm dst: r5 off: -1 imm: 5 26: MovImm32 dst: r0 imm: 1 27: Exit block-4: 28: JNEImm dst: r2 off: -1 imm: 2 29: JNEImm dst: r4 off: -1 imm: 1 30: JNEImm dst: r5 off: -1 imm: 7 31: MovImm32 dst: r0 imm: 1 32: Exit block-5: 33: JNEImm dst: r2 off: -1 imm: 2 34: JNEImm dst: r4 off: -1 imm: 1 35: JNEImm dst: r5 off: -1 imm: 8 36: MovImm32 dst: r0 imm: 1 37: Exit block-6: 38: JNEImm dst: r2 off: -1 imm: 2 39: JNEImm dst: r4 off: -1 imm: 1 40: JNEImm dst: r5 off: -1 imm: 9 41: MovImm32 dst: r0 imm: 1 42: Exit block-7: 43: JNEImm dst: r2 off: -1 imm: 2 44: JNEImm dst: r4 off: -1 imm: 5 45: JNEImm dst: r5 off: -1 imm: 0 46: MovImm32 dst: r0 imm: 1 47: Exit block-8: 48: JNEImm dst: r2 off: -1 imm: 2 49: JNEImm dst: r4 off: -1 imm: 5 50: JNEImm dst: r5 off: -1 imm: 2 51: MovImm32 dst: r0 imm: 1 52: Exit block-9: // tuntap (c, 10, 200, rwm, true) 53: JNEImm dst: r2 off: -1 imm: 2 54: JNEImm dst: r4 off: -1 imm: 10 55: JNEImm dst: r5 off: -1 imm: 200 56: MovImm32 dst: r0 imm: 1 57: Exit block-10: // /dev/pts (c, 136, wildcard, rwm, true) 58: JNEImm dst: r2 off: -1 imm: 2 59: JNEImm dst: r4 off: -1 imm: 136 60: MovImm32 dst: r0 imm: 1 61: Exit block-11: 62: MovImm32 dst: r0 imm: 0 63: Exit ` testDeviceFilter(t, devices, expected) } func TestDeviceFilter_Privileged(t *testing.T) { devices := []*devices.Rule{ { Type: 'a', Major: -1, Minor: -1, Permissions: "rwm", Allow: true, }, } expected := ` // load parameters into registers 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 1: AndImm32 dst: r2 imm: 65535 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 3: RShImm32 dst: r3 imm: 16 4: LdXMemW dst: r4 src: r1 off: 
4 imm: 0 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 1 (accept) 6: MovImm32 dst: r0 imm: 1 7: Exit ` testDeviceFilter(t, devices, expected) } func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { devices := []*devices.Rule{ { Type: 'a', Major: -1, Minor: -1, Permissions: "rwm", Allow: true, }, { Type: 'b', Major: 8, Minor: 0, Permissions: "rwm", Allow: false, }, } expected := ` // load parameters into registers 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 1: AndImm32 dst: r2 imm: 65535 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 3: RShImm32 dst: r3 imm: 16 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 0 (reject) if type==b && major == 8 && minor == 0 6: JNEImm dst: r2 off: -1 imm: 1 7: JNEImm dst: r4 off: -1 imm: 8 8: JNEImm dst: r5 off: -1 imm: 0 9: MovImm32 dst: r0 imm: 0 10: Exit block-1: // return 1 (accept) 11: MovImm32 dst: r0 imm: 1 12: Exit ` testDeviceFilter(t, devices, expected) } func TestDeviceFilter_Weird(t *testing.T) { devices := []*devices.Rule{ { Type: 'b', Major: 8, Minor: 1, Permissions: "rwm", Allow: false, }, { Type: 'a', Major: -1, Minor: -1, Permissions: "rwm", Allow: true, }, { Type: 'b', Major: 8, Minor: 2, Permissions: "rwm", Allow: false, }, } // 8/1 is allowed, 8/2 is not allowed. // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. expected := ` // load parameters into registers 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 1: AndImm32 dst: r2 imm: 65535 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 3: RShImm32 dst: r3 imm: 16 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 0 (reject) if type==b && major == 8 && minor == 2 6: JNEImm dst: r2 off: -1 imm: 1 7: JNEImm dst: r4 off: -1 imm: 8 8: JNEImm dst: r5 off: -1 imm: 2 9: MovImm32 dst: r0 imm: 0 10: Exit block-1: // return 1 (accept) 11: MovImm32 dst: r0 imm: 1 12: Exit ` testDeviceFilter(t, devices, expected) } cgroups-0.0.4/devices/devices.go000066400000000000000000000007041503527177300166030ustar00rootroot00000000000000// Package devices contains functionality to manage cgroup devices, which // is exposed indirectly via libcontainer/cgroups managers. // // To enable cgroup managers to manage devices, this package must be imported. package devices import ( "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/systemd" ) func init() { cgroups.DevicesSetV1 = setV1 cgroups.DevicesSetV2 = setV2 systemd.GenerateDeviceProps = systemdProperties } cgroups-0.0.4/devices/devices_emulator.go000066400000000000000000000301111503527177300205060ustar00rootroot00000000000000// SPDX-License-Identifier: Apache-2.0 /* * Copyright (C) 2020 Aleksa Sarai * Copyright (C) 2020 SUSE LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package devices import ( "bufio" "fmt" "io" "sort" "strconv" "strings" devices "github.com/opencontainers/cgroups/devices/config" ) // deviceMeta is a Rule without the Allow or Permissions fields, and no // wildcard-type support. 
It's effectively the "match" portion of a metadata // rule, for the purposes of our emulation. type deviceMeta struct { node devices.Type major int64 minor int64 } // deviceRule is effectively the tuple (deviceMeta, Permissions). type deviceRule struct { meta deviceMeta perms devices.Permissions } // deviceRules is a mapping of device metadata rules to the associated // permissions in the ruleset. type deviceRules map[deviceMeta]devices.Permissions func (r deviceRules) orderedEntries() []deviceRule { var rules []deviceRule for meta, perms := range r { rules = append(rules, deviceRule{meta: meta, perms: perms}) } sort.Slice(rules, func(i, j int) bool { // Sort by (major, minor, type). a, b := rules[i].meta, rules[j].meta return a.major < b.major || (a.major == b.major && a.minor < b.minor) || (a.major == b.major && a.minor == b.minor && a.node < b.node) }) return rules } type emulator struct { defaultAllow bool rules deviceRules } func (e *emulator) IsBlacklist() bool { return e.defaultAllow } func (e *emulator) IsAllowAll() bool { return e.IsBlacklist() && len(e.rules) == 0 } func parseLine(line string) (*deviceRule, error) { // Input: node major:minor perms. fields := strings.FieldsFunc(line, func(r rune) bool { return r == ' ' || r == ':' }) if len(fields) != 4 { return nil, fmt.Errorf("malformed devices.list rule %s", line) } var ( rule deviceRule node = fields[0] major = fields[1] minor = fields[2] perms = fields[3] ) // Parse the node type. switch node { case "a": // Super-special case -- "a" always means every device with every // access mode. In fact, for devices.list this actually indicates that // the cgroup is in black-list mode. // TODO: Double-check that the entire file is "a *:* rwm". return nil, nil case "b": rule.meta.node = devices.BlockDevice case "c": rule.meta.node = devices.CharDevice default: return nil, fmt.Errorf("unknown device type %q", node) } // Parse the major number. if major == "*" { rule.meta.major = devices.Wildcard } else { val, err := strconv.ParseUint(major, 10, 32) if err != nil { return nil, fmt.Errorf("invalid major number: %w", err) } rule.meta.major = int64(val) } // Parse the minor number. if minor == "*" { rule.meta.minor = devices.Wildcard } else { val, err := strconv.ParseUint(minor, 10, 32) if err != nil { return nil, fmt.Errorf("invalid minor number: %w", err) } rule.meta.minor = int64(val) } // Parse the access permissions. rule.perms = devices.Permissions(perms) if !rule.perms.IsValid() || rule.perms.IsEmpty() { return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) } return &rule, nil } func (e *emulator) addRule(rule deviceRule) error { //nolint:unparam if e.rules == nil { e.rules = make(map[deviceMeta]devices.Permissions) } // Merge with any pre-existing permissions. oldPerms := e.rules[rule.meta] newPerms := rule.perms.Union(oldPerms) e.rules[rule.meta] = newPerms return nil } func (e *emulator) rmRule(rule deviceRule) error { // Give an error if any of the permissions requested to be removed are // present in a partially-matching wildcard rule, because such rules will // be ignored by cgroupv1. // // This is a diversion from cgroupv1, but is necessary to avoid leading // users into a false sense of security. cgroupv1 will silently(!) ignore // requests to remove partial exceptions, but we really shouldn't do that. 
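	//
	// Concretely (a sketch, not tied to any particular device): if the
	// ruleset contains an "allow c 1:* rwm" wildcard, a request to deny
	// "c 1:5 w" cannot be expressed in cgroupv1 -- there is no way to
	// subtract 1:5 from the 1:* exception -- so we return an error rather
	// than silently leaving 1:5 writable.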
	//
	// It may seem like we could just "split" wildcard rules which hit this
	// issue, but unfortunately there are 2^32 possible major and minor
	// numbers, which would exhaust kernel memory quickly if we did this. Not
	// to mention it'd be really slow (the kernel side is implemented as a
	// linked-list of exceptions).
	for _, partialMeta := range []deviceMeta{
		{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
		{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
		{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
	} {
		// This wildcard rule is equivalent to the requested rule, so skip it.
		if rule.meta == partialMeta {
			continue
		}
		// Only give an error if the set of permissions overlap.
		partialPerms := e.rules[partialMeta]
		if !partialPerms.Intersection(rule.perms).IsEmpty() {
			return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
		}
	}

	// Subtract all of the permissions listed from the full match rule. If the
	// rule didn't exist, all of this is a no-op.
	newPerms := e.rules[rule.meta].Difference(rule.perms)
	if newPerms.IsEmpty() {
		delete(e.rules, rule.meta)
	} else {
		e.rules[rule.meta] = newPerms
	}

	// TODO: The actual cgroup code doesn't care if an exception didn't exist
	// during removal, so not erroring out here is /accurate/ but quite
	// worrying. Maybe we should do additional validation, but again we
	// have to worry about backwards-compatibility.
	return nil
}

func (e *emulator) allow(rule *deviceRule) error {
	// This cgroup is configured as a black-list. Reset the entire emulator,
	// and put it into black-list mode.
	if rule == nil || rule.meta.node == devices.WildcardDevice {
		*e = emulator{
			defaultAllow: true,
			rules:        nil,
		}
		return nil
	}

	var err error
	if e.defaultAllow {
		err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
	} else {
		err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
	}
	return err
}

func (e *emulator) deny(rule *deviceRule) error {
	// This cgroup is configured as a white-list. Reset the entire emulator,
	// and put it into white-list mode.
	if rule == nil || rule.meta.node == devices.WildcardDevice {
		*e = emulator{
			defaultAllow: false,
			rules:        nil,
		}
		return nil
	}

	var err error
	if e.defaultAllow {
		err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
	} else {
		err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
	}
	return err
}

func (e *emulator) Apply(rule devices.Rule) error {
	if !rule.Type.CanCgroup() {
		return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
	}

	innerRule := &deviceRule{
		meta: deviceMeta{
			node:  rule.Type,
			major: rule.Major,
			minor: rule.Minor,
		},
		perms: rule.Permissions,
	}
	if innerRule.meta.node == devices.WildcardDevice {
		innerRule = nil
	}

	if rule.Allow {
		return e.allow(innerRule)
	}
	return e.deny(innerRule)
}

// emulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func emulatorFromList(list io.Reader) (*emulator, error) {
	// Normally cgroups are in black-list mode by default, but the way we
	// figure out the current mode is whether or not devices.list has an
	// allow-all rule.
	// So we default to a white-list, and the existence of an
	// "a *:* rwm" entry will tell us otherwise.
	e := &emulator{
		defaultAllow: false,
	}

	// Parse the "devices.list".
	s := bufio.NewScanner(list)
	for s.Scan() {
		line := s.Text()
		deviceRule, err := parseLine(line)
		if err != nil {
			return nil, fmt.Errorf("error parsing line %q: %w", line, err)
		}
		// "devices.list" is an allow list. Note that this means that in
		// black-list mode, we have no idea what rules are in play. As a
		// result, we need to be very careful in Transition().
		if err := e.allow(deviceRule); err != nil {
			return nil, fmt.Errorf("error adding devices.list rule: %w", err)
		}
	}
	if err := s.Err(); err != nil {
		return nil, fmt.Errorf("error reading devices.list lines: %w", err)
	}
	return e, nil
}

// Transition calculates the minimally-disruptive set of rules that need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of the emulator code -- to allow
// us to figure out how to update a container's cgroups without causing
// spurious device errors (if possible).
func (e *emulator) Transition(target *emulator) ([]*devices.Rule, error) {
	var transitionRules []*devices.Rule
	source := e
	oldRules := source.rules

	// If the default policy doesn't match, we need to include a "disruptive"
	// rule (either allow-all or deny-all) in order to switch the cgroup to
	// the correct default policy.
	//
	// However, due to a limitation in "devices.list" we cannot be sure what
	// deny rules are in place in a black-list cgroup. Thus if the source is a
	// black-list we also have to include a disruptive rule.
	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
		transitionRules = append(transitionRules, &devices.Rule{
			Type:        'a',
			Major:       -1,
			Minor:       -1,
			Permissions: devices.Permissions("rwm"),
			Allow:       target.defaultAllow,
		})
		// The old rules are only relevant if we aren't starting out with a
		// disruptive rule.
		oldRules = nil
	}

	// NOTE: We traverse through the rules in a sorted order so we always
	// write the same set of rules (this is to aid testing).

	// First, we create inverse rules for any old rules not in the new set.
	// This includes partial-inverse rules for specific permissions. This is a
	// no-op if we added a disruptive rule, since oldRules will be empty.
	for _, rule := range oldRules.orderedEntries() {
		meta, oldPerms := rule.meta, rule.perms
		newPerms := target.rules[meta]
		droppedPerms := oldPerms.Difference(newPerms)
		if !droppedPerms.IsEmpty() {
			transitionRules = append(transitionRules, &devices.Rule{
				Type:        meta.node,
				Major:       meta.major,
				Minor:       meta.minor,
				Permissions: droppedPerms,
				Allow:       target.defaultAllow,
			})
		}
	}

	// Add any additional rules which weren't in the old set. We happen to
	// filter out rules which are present in both sets, though this isn't
	// strictly necessary.
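	//
	// Worked example (illustrative only): transitioning a white-list from
	// {c 1:3 rw} to {c 1:3 r, c 1:5 w} emits exactly two rules -- a deny of
	// "c 1:3 w" (from the loop above) and an allow of "c 1:5 w" (from the
	// loop below) -- leaving the still-valid "c 1:3 r" untouched.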
for _, rule := range target.rules.orderedEntries() { meta, newPerms := rule.meta, rule.perms oldPerms := oldRules[meta] gainedPerms := newPerms.Difference(oldPerms) if !gainedPerms.IsEmpty() { transitionRules = append(transitionRules, &devices.Rule{ Type: meta.node, Major: meta.major, Minor: meta.minor, Permissions: gainedPerms, Allow: !target.defaultAllow, }) } } return transitionRules, nil } // Rules returns the minimum set of rules necessary to convert a *deny-all* // cgroup to the emulated filter state (note that this is not the same as a // default cgroupv1 cgroup -- which is allow-all). This is effectively just a // wrapper around Transition() with the source emulator being an empty cgroup. func (e *emulator) Rules() ([]*devices.Rule, error) { defaultCgroup := &emulator{defaultAllow: false} return defaultCgroup.Transition(e) } func wrapErr(err error, text string) error { if err == nil { return nil } return fmt.Errorf(text+": %w", err) } cgroups-0.0.4/devices/devices_emulator_test.go000066400000000000000000000622101503527177300215520ustar00rootroot00000000000000// SPDX-License-Identifier: Apache-2.0 /* * Copyright (C) 2020 Aleksa Sarai * Copyright (C) 2020 SUSE LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package devices import ( "bufio" "bytes" "reflect" "strings" "testing" devices "github.com/opencontainers/cgroups/devices/config" ) func TestDeviceEmulatorLoad(t *testing.T) { tests := []struct { name, list string expected *emulator }{ { name: "BlacklistMode", list: `a *:* rwm`, expected: &emulator{ defaultAllow: true, }, }, { name: "WhitelistBasic", list: `c 4:2 rw`, expected: &emulator{ defaultAllow: false, rules: deviceRules{ { node: devices.CharDevice, major: 4, minor: 2, }: devices.Permissions("rw"), }, }, }, { name: "WhitelistWildcard", list: `b 0:* m`, expected: &emulator{ defaultAllow: false, rules: deviceRules{ { node: devices.BlockDevice, major: 0, minor: devices.Wildcard, }: devices.Permissions("m"), }, }, }, { name: "WhitelistDuplicate", list: `c *:* rwm c 1:1 r`, expected: &emulator{ defaultAllow: false, rules: deviceRules{ { node: devices.CharDevice, major: devices.Wildcard, minor: devices.Wildcard, }: devices.Permissions("rwm"), // To match the kernel, we allow redundant rules. 
{ node: devices.CharDevice, major: 1, minor: 1, }: devices.Permissions("r"), }, }, }, { name: "WhitelistComplicated", list: `c *:* m b *:* m c 1:3 rwm c 1:5 rwm c 1:7 rwm c 1:8 rwm c 1:9 rwm c 5:0 rwm c 5:2 rwm c 136:* rwm c 10:200 rwm`, expected: &emulator{ defaultAllow: false, rules: deviceRules{ { node: devices.CharDevice, major: devices.Wildcard, minor: devices.Wildcard, }: devices.Permissions("m"), { node: devices.BlockDevice, major: devices.Wildcard, minor: devices.Wildcard, }: devices.Permissions("m"), { node: devices.CharDevice, major: 1, minor: 3, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 1, minor: 5, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 1, minor: 7, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 1, minor: 8, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 1, minor: 9, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 5, minor: 0, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 5, minor: 2, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 136, minor: devices.Wildcard, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 10, minor: 200, }: devices.Permissions("rwm"), }, }, }, // Some invalid lists. { name: "InvalidFieldNumber", list: `b 1:0`, expected: nil, }, { name: "InvalidDeviceType", list: `p *:* rwm`, expected: nil, }, { name: "InvalidMajorNumber1", list: `p -1:3 rwm`, expected: nil, }, { name: "InvalidMajorNumber2", list: `c foo:27 rwm`, expected: nil, }, { name: "InvalidMinorNumber1", list: `b 1:-4 rwm`, expected: nil, }, { name: "InvalidMinorNumber2", list: `b 1:foo rwm`, expected: nil, }, { name: "InvalidPermissions", list: `b 1:7 rwk`, expected: nil, }, } for _, test := range tests { test := test // capture range variable t.Run(test.name, func(t *testing.T) { list := bytes.NewBufferString(test.list) emu, err := emulatorFromList(list) if err != nil && test.expected != nil { t.Fatalf("unexpected failure when creating emulator: %v", err) } else if err == nil && test.expected == nil { t.Fatalf("unexpected success when creating emulator: %#v", emu) } if !reflect.DeepEqual(emu, test.expected) { t.Errorf("final emulator state mismatch: %#v != %#v", emu, test.expected) } }) } } func testDeviceEmulatorApply(t *testing.T, baseDefaultAllow bool) { tests := []struct { name string rule devices.Rule base, expected *emulator }{ // Switch between default modes. 
{ name: "SwitchToOtherMode", rule: devices.Rule{ Type: devices.WildcardDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: devices.Permissions("rwm"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: devices.Wildcard, minor: devices.Wildcard, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 1, minor: 1, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: !baseDefaultAllow, rules: nil, }, }, { name: "SwitchToSameModeNoop", rule: devices.Rule{ Type: devices.WildcardDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: devices.Permissions("rwm"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: nil, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: nil, }, }, { name: "SwitchToSameMode", rule: devices.Rule{ Type: devices.WildcardDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: devices.Permissions("rwm"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: devices.Wildcard, minor: devices.Wildcard, }: devices.Permissions("rwm"), { node: devices.CharDevice, major: 1, minor: 1, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: nil, }, }, // Rule addition logic. { name: "RuleAdditionBasic", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rm"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("rm"), }, }, }, { name: "RuleAdditionBasicDuplicate", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rm"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("rwm"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("rwm"), // To match the kernel, we allow redundant rules. 
{ node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("rm"), }, }, }, { name: "RuleAdditionBasicDuplicateNoop", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rm"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("rm"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("rm"), }, }, }, { name: "RuleAdditionMerge", rule: devices.Rule{ Type: devices.BlockDevice, Major: 5, Minor: 12, Permissions: devices.Permissions("rm"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 5, minor: 12, }: devices.Permissions("rw"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 5, minor: 12, }: devices.Permissions("rwm"), }, }, }, { name: "RuleAdditionMergeWildcard", rule: devices.Rule{ Type: devices.BlockDevice, Major: 5, Minor: devices.Wildcard, Permissions: devices.Permissions("rm"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 5, minor: devices.Wildcard, }: devices.Permissions("rw"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 5, minor: devices.Wildcard, }: devices.Permissions("rwm"), }, }, }, { name: "RuleAdditionMergeNoop", rule: devices.Rule{ Type: devices.BlockDevice, Major: 5, Minor: 12, Permissions: devices.Permissions("r"), Allow: !baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 5, minor: 12, }: devices.Permissions("rw"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 2, minor: 1, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 5, minor: 12, }: devices.Permissions("rw"), }, }, }, // Rule removal logic. 
{ name: "RuleRemovalBasic", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rm"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("rm"), { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, }, { name: "RuleRemovalNonexistent", rule: devices.Rule{ Type: devices.CharDevice, Major: 4, Minor: 1, Permissions: devices.Permissions("rw"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, }, { name: "RuleRemovalFull", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rw"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("w"), { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, }, { name: "RuleRemovalPartial", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("r"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("rm"), { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("m"), { node: devices.BlockDevice, major: 1, minor: 5, }: devices.Permissions("r"), }, }, }, // Check our non-canonical behaviour when it comes to try to "punch // out" holes in a wildcard rule. 
{ name: "RuleRemovalWildcardPunchoutImpossible", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("r"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("rm"), { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("r"), }, }, expected: nil, }, { name: "RuleRemovalWildcardPunchoutPossible", rule: devices.Rule{ Type: devices.CharDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("r"), Allow: baseDefaultAllow, }, base: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("wm"), { node: devices.CharDevice, major: 42, minor: 1337, }: devices.Permissions("r"), }, }, expected: &emulator{ defaultAllow: baseDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("wm"), }, }, }, } for _, test := range tests { test := test t.Run(test.name, func(t *testing.T) { err := test.base.Apply(test.rule) if err != nil && test.expected != nil { t.Fatalf("unexpected failure when applying apply rule: %v", err) } else if err == nil && test.expected == nil { t.Fatalf("unexpected success when applying apply rule: %#v", test.base) } if test.expected != nil && !reflect.DeepEqual(test.base, test.expected) { t.Errorf("final emulator state mismatch: %#v != %#v", test.base, test.expected) } }) } } func TestDeviceEmulatorWhitelistApply(t *testing.T) { testDeviceEmulatorApply(t, false) } func TestDeviceEmulatorBlacklistApply(t *testing.T) { testDeviceEmulatorApply(t, true) } func testDeviceEmulatorTransition(t *testing.T, sourceDefaultAllow bool) { tests := []struct { name string source, target *emulator expected []*devices.Rule }{ // No-op changes. { name: "Noop", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("wm"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("wm"), }, }, // Identical white-lists produce no extra rules. expected: nil, }, // Switching modes. { name: "SwitchToOtherMode", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), }, }, target: &emulator{ defaultAllow: !sourceDefaultAllow, rules: deviceRules{ { node: devices.BlockDevice, major: 42, minor: devices.Wildcard, }: devices.Permissions("wm"), }, }, expected: []*devices.Rule{ // Clear-all rule. { Type: devices.WildcardDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: devices.Permissions("rwm"), Allow: !sourceDefaultAllow, }, // The actual rule-set. { Type: devices.BlockDevice, Major: 42, Minor: devices.Wildcard, Permissions: devices.Permissions("wm"), Allow: sourceDefaultAllow, }, }, }, // Rule changes. 
{ name: "RuleAddition", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 42, minor: 1337, }: devices.Permissions("rwm"), }, }, expected: []*devices.Rule{ { Type: devices.BlockDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rwm"), Allow: !sourceDefaultAllow, }, }, }, { name: "RuleRemoval", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 42, minor: 1337, }: devices.Permissions("rwm"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), }, }, expected: []*devices.Rule{ { Type: devices.BlockDevice, Major: 42, Minor: 1337, Permissions: devices.Permissions("rwm"), Allow: sourceDefaultAllow, }, }, }, { name: "RuleMultipleAdditionRemoval", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), { node: devices.BlockDevice, major: 3, minor: 9, }: devices.Permissions("rw"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), }, }, expected: []*devices.Rule{ { Type: devices.BlockDevice, Major: 3, Minor: 9, Permissions: devices.Permissions("rw"), Allow: sourceDefaultAllow, }, }, }, // Modifying the access permissions. 
{ name: "RulePartialAddition", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("r"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rwm"), }, }, expected: []*devices.Rule{ { Type: devices.CharDevice, Major: 1, Minor: 2, Permissions: devices.Permissions("wm"), Allow: !sourceDefaultAllow, }, }, }, { name: "RulePartialRemoval", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rw"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("w"), }, }, expected: []*devices.Rule{ { Type: devices.CharDevice, Major: 1, Minor: 2, Permissions: devices.Permissions("r"), Allow: sourceDefaultAllow, }, }, }, { name: "RulePartialBoth", source: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rw"), }, }, target: &emulator{ defaultAllow: sourceDefaultAllow, rules: deviceRules{ { node: devices.CharDevice, major: 1, minor: 2, }: devices.Permissions("rm"), }, }, expected: []*devices.Rule{ { Type: devices.CharDevice, Major: 1, Minor: 2, Permissions: devices.Permissions("w"), Allow: sourceDefaultAllow, }, { Type: devices.CharDevice, Major: 1, Minor: 2, Permissions: devices.Permissions("m"), Allow: !sourceDefaultAllow, }, }, }, } for _, test := range tests { test := test t.Run(test.name, func(t *testing.T) { // If we are in black-list mode, we need to prepend the relevant // clear-all rule (the expected rule lists are written with // white-list mode in mind), and then make a full copy of the // target rules. if sourceDefaultAllow && test.source.defaultAllow == test.target.defaultAllow { test.expected = []*devices.Rule{{ Type: devices.WildcardDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: devices.Permissions("rwm"), Allow: test.target.defaultAllow, }} for _, rule := range test.target.rules.orderedEntries() { test.expected = append(test.expected, &devices.Rule{ Type: rule.meta.node, Major: rule.meta.major, Minor: rule.meta.minor, Permissions: rule.perms, Allow: !test.target.defaultAllow, }) } } rules, err := test.source.Transition(test.target) if err != nil { t.Fatalf("unexpected error while calculating transition rules: %#v", err) } if !reflect.DeepEqual(rules, test.expected) { t.Errorf("rules don't match expected set: %#v != %#v", rules, test.expected) } // Apply the rules to the source to see if it actually transitions // correctly. This is all emulated but it's a good thing to // double-check. 
for _, rule := range rules { if err := test.source.Apply(*rule); err != nil { t.Fatalf("error while applying transition rule [%#v]: %v", rule, err) } } if !reflect.DeepEqual(test.source, test.target) { t.Errorf("transition incomplete after applying all rules: %#v != %#v", test.source, test.target) } }) } } func TestDeviceEmulatorTransitionFromBlacklist(t *testing.T) { testDeviceEmulatorTransition(t, true) } func TestDeviceEmulatorTransitionFromWhitelist(t *testing.T) { testDeviceEmulatorTransition(t, false) } func BenchmarkParseLine(b *testing.B) { list := `c *:* m b *:* m c 1:3 rwm c 1:5 rwm c 1:7 rwm c 1:8 rwm c 1:9 rwm c 5:0 rwm c 5:2 rwm c 136:* rwm c 10:200 rwm` var r *deviceRule var err error for i := 0; i < b.N; i++ { s := bufio.NewScanner(strings.NewReader(list)) for s.Scan() { line := s.Text() r, err = parseLine(line) } if err := s.Err(); err != nil { b.Fatal(err) } } b.Logf("rule: %v, err: %v", r, err) } cgroups-0.0.4/devices/ebpf_linux.go000066400000000000000000000176261503527177300173270ustar00rootroot00000000000000package devices import ( "errors" "fmt" "os" "runtime" "sync" "unsafe" "github.com/cilium/ebpf" "github.com/cilium/ebpf/asm" "github.com/cilium/ebpf/link" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) func nilCloser() error { return nil } func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { type bpfAttrQuery struct { TargetFd uint32 AttachType uint32 QueryType uint32 AttachFlags uint32 ProgIds uint64 // __aligned_u64 ProgCnt uint32 } // Currently you can only have 64 eBPF programs attached to a cgroup. size := 64 retries := 0 for retries < 10 { progIds := make([]uint32, size) query := bpfAttrQuery{ TargetFd: uint32(dirFd), AttachType: uint32(unix.BPF_CGROUP_DEVICE), ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), ProgCnt: uint32(len(progIds)), } // Fetch the list of program ids. _, _, errno := unix.Syscall(unix.SYS_BPF, uintptr(unix.BPF_PROG_QUERY), uintptr(unsafe.Pointer(&query)), unsafe.Sizeof(query)) size = int(query.ProgCnt) runtime.KeepAlive(query) if errno != 0 { // On ENOSPC we get the correct number of programs. if errno == unix.ENOSPC { retries++ continue } return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) } // Convert the ids to program handles. progIds = progIds[:size] programs := make([]*ebpf.Program, 0, len(progIds)) for _, progId := range progIds { program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) if err != nil { // We skip over programs that give us -EACCES or -EPERM. This // is necessary because there may be BPF programs that have // been attached (such as with --systemd-cgroup) which have an // LSM label that blocks us from interacting with the program. // // Because additional BPF_CGROUP_DEVICE programs only can add // restrictions, there's no real issue with just ignoring these // programs (and stops runc from breaking on distributions with // very strict SELinux policies). 
if errors.Is(err, os.ErrPermission) { logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) continue } return nil, fmt.Errorf("cannot fetch program from id: %w", err) } programs = append(programs, program) } runtime.KeepAlive(progIds) return programs, nil } return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") } var ( haveBpfProgReplaceBool bool haveBpfProgReplaceOnce sync.Once ) // Loosely based on the BPF_F_REPLACE support check in // https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. // // TODO: move this logic to cilium/ebpf func haveBpfProgReplace() bool { haveBpfProgReplaceOnce.Do(func() { prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ Type: ebpf.CGroupDevice, License: "MIT", Instructions: asm.Instructions{ asm.Mov.Imm(asm.R0, 0), asm.Return(), }, }) if err != nil { logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) return } defer prog.Close() devnull, err := os.Open("/dev/null") if err != nil { logrus.Warnf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) return } defer devnull.Close() // We know that we have BPF_PROG_ATTACH since we can load // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL // we know that the feature isn't present. err = link.RawAttachProgram(link.RawAttachProgramOptions{ // We rely on this fd being checked after attachFlags in the kernel. Target: int(devnull.Fd()), // Attempt to "replace" our BPF program with itself. This will // always fail, but we should get -EINVAL if BPF_F_REPLACE is not // supported. Anchor: link.ReplaceProgram(prog), Program: prog, Attach: ebpf.AttachCGroupDevice, Flags: unix.BPF_F_ALLOW_MULTI, }) if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) { // not supported return } if !errors.Is(err, unix.EBADF) { // If we see any new errors here, it's possible that there is a // regression due to a cilium/ebpf update and the above EINVAL // checks are not working. So, be loud about it so someone notices // and we can get the issue fixed quicker. logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) } haveBpfProgReplaceBool = true }) return haveBpfProgReplaceBool } // loadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. // // Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . // // https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). // This limit is not inherited into the container. memlockLimit := &unix.Rlimit{ Cur: unix.RLIM_INFINITY, Max: unix.RLIM_INFINITY, } _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) // Get the list of existing programs. oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) if err != nil { return nilCloser, err } useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 // Generate new program. spec := &ebpf.ProgramSpec{ Type: ebpf.CGroupDevice, Instructions: insts, License: license, } prog, err := ebpf.NewProgram(spec) if err != nil { return nilCloser, err } // If there is only one old program, we can just replace it directly. 
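	// Replacing via BPF_F_REPLACE is atomic; the attach-then-detach fallback
	// below briefly leaves both the old and the new filter attached, and
	// since the kernel only permits access when every attached program
	// permits it, that window can cause spurious EPERM for newly-allowed
	// devices. This is why the replace path is preferred when available.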
attachProgramOptions := link.RawAttachProgramOptions{ Target: dirFd, Program: prog, Attach: ebpf.AttachCGroupDevice, Flags: unix.BPF_F_ALLOW_MULTI, } if useReplaceProg { attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0]) } err = link.RawAttachProgram(attachProgramOptions) if err != nil { return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) } closer := func() error { err = link.RawDetachProgram(link.RawDetachProgramOptions{ Target: dirFd, Program: prog, Attach: ebpf.AttachCGroupDevice, }) if err != nil { return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) } // TODO: Should we attach the old filters back in this case? Otherwise // we fail-open on a security feature, which is a bit scary. return nil } if !useReplaceProg { logLevel := logrus.DebugLevel // If there was more than one old program, give a warning (since this // really shouldn't happen with runc-managed cgroups) and then detach // all the old programs. if len(oldProgs) > 1 { // NOTE: Ideally this should be a warning but it turns out that // systemd-managed cgroups trigger this warning (apparently // systemd doesn't delete old non-systemd programs when // setting properties). logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) logLevel = logrus.InfoLevel } for idx, oldProg := range oldProgs { // Output some extra debug info. if info, err := oldProg.Info(); err == nil { fields := logrus.Fields{ "type": info.Type.String(), "tag": info.Tag, "name": info.Name, } if id, ok := info.ID(); ok { fields["id"] = id } if runCount, ok := info.RunCount(); ok { fields["run_count"] = runCount } if runtime, ok := info.Runtime(); ok { fields["runtime"] = runtime.String() } logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) } err = link.RawDetachProgram(link.RawDetachProgramOptions{ Target: dirFd, Program: oldProg, Attach: ebpf.AttachCGroupDevice, }) if err != nil { return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) } } } return closer, nil } cgroups-0.0.4/devices/systemd.go000066400000000000000000000201611503527177300166500ustar00rootroot00000000000000package devices import ( "bufio" "fmt" "os" "strconv" "strings" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/godbus/dbus/v5" "github.com/sirupsen/logrus" "github.com/opencontainers/cgroups" devices "github.com/opencontainers/cgroups/devices/config" ) // systemdProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) { if r.SkipDevices { return nil, nil } properties := []systemdDbus.Property{ // When we later add DeviceAllow=/dev/foo properties, we are // appending devices to the allow list for the unit. However, // if this is an existing unit, it already has DeviceAllow= // entries, and we need to clear them all before applying the // new set. (We also do this for new units, mainly for safety // to ensure we only enable the devices we expect.) // // To clear any existing DeviceAllow= rules, we have to add an // empty DeviceAllow= property. newProp("DeviceAllow", []deviceAllowEntry{}), // Always run in the strictest white-list mode. newProp("DevicePolicy", "strict"), } // Figure out the set of rules. 
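	//
	// (To sketch the end result: an allow rule for /dev/null, "c 1:3 rwm",
	// ends up as the property DeviceAllow=/dev/char/1:3 rwm, while a full
	// char wildcard "c *:* m" becomes DeviceAllow=char-* m; the conversion
	// rules are spelled out below.)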
configEmu := emulator{} for _, rule := range r.Devices { if err := configEmu.Apply(*rule); err != nil { return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) } } // systemd doesn't support blacklists. So we log a warning, and tell // systemd to act as a deny-all whitelist. This ruleset will be replaced // with our normal fallback code. This may result in spurious errors, but // the only other option is to error out here. if configEmu.IsBlacklist() { // However, if we're dealing with an allow-all rule then we can do it. if configEmu.IsAllowAll() { return allowAllDevices(), nil } logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") return properties, nil } // Now generate the set of rules we actually need to apply. Unlike the // normal devices cgroup, in "strict" mode systemd defaults to a deny-all // whitelist which is the default for devices.Emulator. finalRules, err := configEmu.Rules() if err != nil { return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) } var deviceAllowList []deviceAllowEntry for _, rule := range finalRules { if !rule.Allow { // Should never happen. return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) } switch rule.Type { case devices.BlockDevice, devices.CharDevice: default: // Should never happen. return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) } entry := deviceAllowEntry{ Perms: string(rule.Permissions), } // systemd has a fairly odd (though understandable) syntax here, and // because of the OCI configuration format we have to do quite a bit of // trickery to convert things: // // * Concrete rules with non-wildcard major/minor numbers have to use // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses // stat(2) on such paths to look up device properties, meaning we // cannot add whitelist rules for devices that don't exist. Since v240, // device properties are parsed from the path string. // // However, path globbing is not supported for path-based rules so we // need to handle wildcards in some other manner. // // * If systemd older than v240 is used, wildcard-minor rules // have to specify a "device group name" (the second column // in /proc/devices). // // * Wildcard (major and minor) rules can just specify a glob with the // type ("char-*" or "block-*"). // // The only type of rule we can't handle is wildcard-major rules, and // so we'll give a warning in that case (note that the fallback code // will insert any rules systemd couldn't handle). What amazing fun. if rule.Major == devices.Wildcard { // "_ *:n _" rules aren't supported by systemd. if rule.Minor != devices.Wildcard { logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) continue } // "_ *:* _" rules just wildcard everything. prefix, err := groupPrefix(rule.Type) if err != nil { return nil, err } entry.Path = prefix + "*" } else if rule.Minor == devices.Wildcard { if sdVer >= 240 { // systemd v240+ allows for {block,char}-MAJOR syntax. prefix, err := groupPrefix(rule.Type) if err != nil { return nil, err } entry.Path = prefix + strconv.FormatInt(rule.Major, 10) } else { // For older systemd, "_ n:* _" rules require a device group from /proc/devices. group, err := findDeviceGroup(rule.Type, rule.Major) if err != nil { return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) } if group == "" { // Couldn't find a group. 
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) continue } entry.Path = group } } else { // "_ n:m _" rules are just a path in /dev/{block,char}/. switch rule.Type { case devices.BlockDevice: entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) case devices.CharDevice: entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) } if sdVer < 240 { // Old systemd versions use stat(2) on path to find out device major:minor // numbers and type. If the path doesn't exist, it will not add the rule, // emitting a warning instead. // Since all of this logic is best-effort anyway (we manually set these // rules separately to systemd) we can safely skip entries that don't // have a corresponding path. if _, err := os.Stat(entry.Path); err != nil { continue } } } deviceAllowList = append(deviceAllowList, entry) } properties = append(properties, newProp("DeviceAllow", deviceAllowList)) return properties, nil } func newProp(name string, units any) systemdDbus.Property { return systemdDbus.Property{ Name: name, Value: dbus.MakeVariant(units), } } func groupPrefix(ruleType devices.Type) (string, error) { switch ruleType { case devices.BlockDevice: return "block-", nil case devices.CharDevice: return "char-", nil default: return "", fmt.Errorf("device type %v has no group prefix", ruleType) } } // findDeviceGroup tries to find the device group name (as listed in // /proc/devices) with the type prefixed as required for DeviceAllow, for a // given (type, major) combination. If more than one device group exists, an // arbitrary one is chosen. func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { fh, err := os.Open("/proc/devices") if err != nil { return "", err } defer fh.Close() prefix, err := groupPrefix(ruleType) if err != nil { return "", err } ruleMajorStr := strconv.FormatInt(ruleMajor, 10) + " " scanner := bufio.NewScanner(fh) var currentType devices.Type for scanner.Scan() { // We need to strip spaces because the first number is column-aligned. line := strings.TrimSpace(scanner.Text()) // Handle the "header" lines. switch line { case "Block devices:": currentType = devices.BlockDevice continue case "Character devices:": currentType = devices.CharDevice continue case "": continue } // Skip lines unrelated to our type. if currentType != ruleType { continue } if group, ok := strings.CutPrefix(line, ruleMajorStr); ok { return prefix + group, nil } } if err := scanner.Err(); err != nil { return "", fmt.Errorf("reading /proc/devices: %w", err) } // Couldn't find the device group. return "", nil } // DeviceAllow is the dbus type "a(ss)" which means we need a struct // to represent it in Go. type deviceAllowEntry struct { Path string Perms string } func allowAllDevices() []systemdDbus.Property { // Setting mode to auto and removing all DeviceAllow rules // results in allowing access to all devices. 
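	// This should be roughly equivalent to running (a sketch, untested):
	//
	//	systemctl set-property $UNIT DevicePolicy=auto DeviceAllow=
	//
	// where the empty DeviceAllow= assignment clears any existing entries.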
return []systemdDbus.Property{ newProp("DeviceAllow", []deviceAllowEntry{}), newProp("DevicePolicy", "auto"), } } cgroups-0.0.4/devices/systemd_test.go000066400000000000000000000161101503527177300177060ustar00rootroot00000000000000package devices import ( "bytes" "fmt" "os" "os/exec" "strings" "testing" "github.com/opencontainers/cgroups" devices "github.com/opencontainers/cgroups/devices/config" "github.com/opencontainers/cgroups/systemd" ) // TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true // does not result in spurious "permission denied" errors in a container // running under the pod. The test is somewhat similar in nature to the // @test "update devices [minimal transition rules]" in tests/integration, // but uses a pod. func TestPodSkipDevicesUpdate(t *testing.T) { if !systemd.IsRunningSystemd() { t.Skip("Test requires systemd.") } if os.Geteuid() != 0 { t.Skip("Test requires root.") } podName := "system-runc_test_pod" + t.Name() + ".slice" podConfig := &cgroups.Cgroup{ Systemd: true, Parent: "system.slice", Name: podName, Resources: &cgroups.Resources{ PidsLimit: 42, Memory: 32 * 1024 * 1024, SkipDevices: true, }, } // Create "pod" cgroup (a systemd slice to hold containers). pm := newManager(t, podConfig) if err := pm.Apply(-1); err != nil { t.Fatal(err) } if err := pm.Set(podConfig.Resources); err != nil { t.Fatal(err) } containerConfig := &cgroups.Cgroup{ Parent: podName, ScopePrefix: "test", Name: "PodSkipDevicesUpdate", Resources: &cgroups.Resources{ Devices: []*devices.Rule{ // Allow access to /dev/null. { Type: devices.CharDevice, Major: 1, Minor: 3, Permissions: "rwm", Allow: true, }, }, }, } // Create a "container" within the "pod" cgroup. // This is not a real container, just a process in the cgroup. cmd := exec.Command("sleep", "infinity") cmd.Env = append(os.Environ(), "LANG=C") var stderr bytes.Buffer cmd.Stderr = &stderr if err := cmd.Start(); err != nil { t.Fatal(err) } // Make sure to not leave a zombie. defer func() { // These may fail, we don't care. _ = cmd.Process.Kill() _ = cmd.Wait() }() // Put the process into a cgroup. cm := newManager(t, containerConfig) if err := cm.Apply(cmd.Process.Pid); err != nil { t.Fatal(err) } // Check that we put the "container" into the "pod" cgroup. if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", cm.Path("devices"), pm.Path("devices")) } if err := cm.Set(containerConfig.Resources); err != nil { t.Fatal(err) } // Now update the pod a few times. for range 42 { podConfig.Resources.PidsLimit++ podConfig.Resources.Memory += 1024 * 1024 if err := pm.Set(podConfig.Resources); err != nil { t.Fatal(err) } } // Kill the "container". if err := cmd.Process.Kill(); err != nil { t.Fatal(err) } _ = cmd.Wait() // "Container" stderr should be empty. if stderr.Len() != 0 { t.Fatalf("container stderr not empty: %s", stderr.String()) } } func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { if !systemd.IsRunningSystemd() { t.Skip("Test requires systemd.") } if os.Geteuid() != 0 { t.Skip("Test requires root.") } podConfig := &cgroups.Cgroup{ Parent: "system.slice", Name: "system-runc_test_pods.slice", Resources: &cgroups.Resources{ SkipDevices: skipDevices, }, } // Create "pods" cgroup (a systemd slice to hold containers). 
pm := newManager(t, podConfig) if err := pm.Apply(-1); err != nil { t.Fatal(err) } if err := pm.Set(podConfig.Resources); err != nil { t.Fatal(err) } config := &cgroups.Cgroup{ Parent: "system-runc_test_pods.slice", ScopePrefix: "test", Name: "SkipDevices", Resources: &cgroups.Resources{ Devices: []*devices.Rule{ // Allow access to /dev/full only. { Type: devices.CharDevice, Major: 1, Minor: 7, Permissions: "rwm", Allow: true, }, }, }, } // Create a "container" within the "pods" cgroup. // This is not a real container, just a process in the cgroup. cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true") cmd.Env = append(os.Environ(), "LANG=C") stdinR, stdinW, err := os.Pipe() if err != nil { t.Fatal(err) } cmd.Stdin = stdinR var stderr bytes.Buffer cmd.Stderr = &stderr err = cmd.Start() stdinR.Close() defer stdinW.Close() if err != nil { t.Fatal(err) } // Make sure to not leave a zombie. defer func() { // These may fail, we don't care. _, _ = stdinW.WriteString("hey\n") _ = cmd.Wait() }() // Put the process into a cgroup. m := newManager(t, config) if err := m.Apply(cmd.Process.Pid); err != nil { t.Fatal(err) } // Check that we put the "container" into the "pod" cgroup. if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) { t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", m.Path("devices"), pm.Path("devices")) } if err := m.Set(config.Resources); err != nil { // failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") { // Cgroup v1 devices controller gives EPERM on trying // to enable devices that are not enabled // (skipDevices=false) in a parent cgroup. // If this happens, the test is passing. return } t.Fatal(err) } // Check that we can access /dev/full but not /dev/null. if _, err := stdinW.WriteString("wow\n"); err != nil { t.Fatal(err) } if err := cmd.Wait(); err != nil { t.Fatal(err) } for _, exp := range expected { if !strings.Contains(stderr.String(), exp) { t.Errorf("expected %q, got: %s", exp, stderr.String()) } } } func TestSkipDevicesTrue(t *testing.T) { testSkipDevices(t, true, []string{ "echo: write error: No space left on device", "cat: /dev/null: Operation not permitted", }) } func TestSkipDevicesFalse(t *testing.T) { // If SkipDevices is not set for the parent slice, access to both // devices should fail. This is done to assess the test correctness. // For cgroup v1, we check for m.Set returning EPERM. // For cgroup v2, we check for the errors below.
testSkipDevices(t, false, []string{ "/dev/full: Operation not permitted", "cat: /dev/null: Operation not permitted", }) } func testFindDeviceGroup() error { const ( major = 136 group = "char-pts" ) res, err := findDeviceGroup(devices.CharDevice, major) if res != group || err != nil { return fmt.Errorf("expected %v, nil, got %v, %w", group, res, err) } return nil } func TestFindDeviceGroup(t *testing.T) { if err := testFindDeviceGroup(); err != nil { t.Fatal(err) } } func BenchmarkFindDeviceGroup(b *testing.B) { for i := 0; i < b.N; i++ { if err := testFindDeviceGroup(); err != nil { b.Fatal(err) } } } func newManager(t *testing.T, config *cgroups.Cgroup) (m cgroups.Manager) { t.Helper() var err error if cgroups.IsCgroup2UnifiedMode() { m, err = systemd.NewUnifiedManager(config, "") } else { m, err = systemd.NewLegacyManager(config, nil) } if err != nil { t.Fatal(err) } t.Cleanup(func() { _ = m.Destroy() }) return m } cgroups-0.0.4/devices/v1.go000066400000000000000000000042301503527177300155050ustar00rootroot00000000000000package devices import ( "bytes" "errors" "reflect" "github.com/moby/sys/userns" "github.com/opencontainers/cgroups" devices "github.com/opencontainers/cgroups/devices/config" ) var testingSkipFinalCheck bool func setV1(path string, r *cgroups.Resources) error { if userns.RunningInUserNS() || r.SkipDevices { return nil } // Generate two emulators, one for the current state of the cgroup and one // for the requested state by the user. current, err := loadEmulator(path) if err != nil { return err } target, err := buildEmulator(r.Devices) if err != nil { return err } // Compute the minimal set of transition rules needed to achieve the // requested state. transitionRules, err := current.Transition(target) if err != nil { return err } for _, rule := range transitionRules { file := "devices.deny" if rule.Allow { file = "devices.allow" } if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { return err } } // Final safety check -- ensure that the resulting state is what was // requested. This is only really correct for white-lists, but for // black-lists we can at least check that the cgroup is in the right mode. // // This safety-check is skipped for the unit tests because we cannot // currently mock devices.list correctly. if !testingSkipFinalCheck { currentAfter, err := loadEmulator(path) if err != nil { return err } if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { return errors.New("resulting devices cgroup doesn't precisely match target") } else if target.IsBlacklist() != currentAfter.IsBlacklist() { return errors.New("resulting devices cgroup doesn't match target mode") } } return nil } func loadEmulator(path string) (*emulator, error) { list, err := cgroups.ReadFile(path, "devices.list") if err != nil { return nil, err } return emulatorFromList(bytes.NewBufferString(list)) } func buildEmulator(rules []*devices.Rule) (*emulator, error) { // This defaults to a white-list -- which is what we want! 
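// Illustrative: a single allow rule such as
//   &devices.Rule{Type: devices.CharDevice, Major: 1, Minor: 5, Permissions: "rwm", Allow: true}
// yields an emulator equivalent to a devices.list of "c 1:5 rwm"
// (compare TestSetV1Allow below).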
emu := &emulator{} for _, rule := range rules { if err := emu.Apply(*rule); err != nil { return nil, err } } return emu, nil } cgroups-0.0.4/devices/v1_test.go000066400000000000000000000026501503527177300165500ustar00rootroot00000000000000package devices import ( "os" "path" "testing" "github.com/moby/sys/userns" "github.com/opencontainers/cgroups" devices "github.com/opencontainers/cgroups/devices/config" "github.com/opencontainers/cgroups/fscommon" ) func init() { testingSkipFinalCheck = true cgroups.TestMode = true } func TestSetV1Allow(t *testing.T) { if userns.RunningInUserNS() { t.Skip("userns detected; setV1 does nothing") } dir := t.TempDir() for file, contents := range map[string]string{ "devices.allow": "", "devices.deny": "", "devices.list": "a *:* rwm", } { err := os.WriteFile(path.Join(dir, file), []byte(contents), 0o600) if err != nil { t.Fatal(err) } } r := &cgroups.Resources{ Devices: []*devices.Rule{ { Type: devices.CharDevice, Major: 1, Minor: 5, Permissions: devices.Permissions("rwm"), Allow: true, }, }, } if err := setV1(dir, r); err != nil { t.Fatal(err) } // The default deny rule must be written. value, err := fscommon.GetCgroupParamString(dir, "devices.deny") if err != nil { t.Fatal(err) } if value[0] != 'a' { t.Errorf("Got the wrong value (%q), set devices.deny failed.", value) } // Permitted rule must be written. if value, err := fscommon.GetCgroupParamString(dir, "devices.allow"); err != nil { t.Fatal(err) } else if value != "c 1:5 rwm" { t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) } } cgroups-0.0.4/devices/v2.go000066400000000000000000000036251503527177300155150ustar00rootroot00000000000000package devices import ( "fmt" "github.com/moby/sys/userns" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" devices "github.com/opencontainers/cgroups/devices/config" ) func isRWM(perms devices.Permissions) bool { var r, w, m bool for _, perm := range perms { switch perm { case 'r': r = true case 'w': w = true case 'm': m = true } } return r && w && m } // This is similar to the logic applied in crun for handling errors from bpf(2). func canSkipEBPFError(r *cgroups.Resources) bool { // If we're running in a user namespace we can ignore eBPF rules because we // usually cannot use bpf(2), and rootless containers usually don't // have the necessary privileges to mknod(2) device inodes or access // host-level instances (though ideally we would be blocking device access // for rootless containers anyway). if userns.RunningInUserNS() { return true } // We cannot ignore an eBPF load error if any rule is a block rule or it // doesn't permit all access modes. // // NOTE: This will sometimes trigger in cases where access modes are split // between different rules, but to handle this correctly would require // using ".../libcontainer/cgroup/devices".Emulator.
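// Illustrative (hypothetical rule sets): if every rule looks like
//   {Allow: true, Permissions: "rwm"}
// the load error may be ignored; a single deny rule, or a rule carrying only
// "rw" permissions, makes the error fatal.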
for _, dev := range r.Devices { if !dev.Allow || !isRWM(dev.Permissions) { return false } } return true } func setV2(dirPath string, r *cgroups.Resources) error { if r.SkipDevices { return nil } insts, license, err := deviceFilter(r.Devices) if err != nil { return err } dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600) if err != nil { return fmt.Errorf("cannot get dir FD for %s", dirPath) } defer unix.Close(dirFD) if _, err := loadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { if !canSkipEBPFError(r) { return err } } return nil } cgroups-0.0.4/file.go000066400000000000000000000133561503527177300144650ustar00rootroot00000000000000package cgroups import ( "bytes" "errors" "fmt" "os" "path/filepath" "strconv" "strings" "sync" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) // OpenFile opens a cgroup file in a given dir with given flags. // It is supposed to be used for cgroup files only, and returns // an error if the file is not a cgroup file. // // Arguments dir and file are joined together to form an absolute path // to a file being opened. func OpenFile(dir, file string, flags int) (*os.File, error) { if dir == "" { return nil, fmt.Errorf("no directory specified for %s", file) } return openFile(dir, file, flags) } // ReadFile reads data from a cgroup file in dir. // It is supposed to be used for cgroup files only. func ReadFile(dir, file string) (string, error) { fd, err := OpenFile(dir, file, unix.O_RDONLY) if err != nil { return "", err } defer fd.Close() var buf bytes.Buffer _, err = buf.ReadFrom(fd) return buf.String(), err } // WriteFile writes data to a cgroup file in dir. // It is supposed to be used for cgroup files only. func WriteFile(dir, file, data string) error { fd, err := OpenFile(dir, file, unix.O_WRONLY) if err != nil { return err } defer fd.Close() if _, err := fd.WriteString(data); err != nil { // Having data in the error message helps in debugging. return fmt.Errorf("failed to write %q: %w", data, err) } return nil } // WriteFileByLine is the same as WriteFile, except if data contains newlines, // it is written line by line. func WriteFileByLine(dir, file, data string) error { i := strings.Index(data, "\n") if i == -1 { return WriteFile(dir, file, data) } fd, err := OpenFile(dir, file, unix.O_WRONLY) if err != nil { return err } defer fd.Close() start := 0 for { var line string if i == -1 { line = data[start:] } else { line = data[start : start+i+1] } _, err := fd.WriteString(line) if err != nil { return fmt.Errorf("failed to write %q: %w", line, err) } if i == -1 { break } start += i + 1 i = strings.Index(data[start:], "\n") } return nil } const ( cgroupfsDir = "/sys/fs/cgroup" cgroupfsPrefix = cgroupfsDir + "/" ) var ( // TestMode is set to true by unit tests that need "fake" cgroupfs. 
TestMode bool cgroupRootHandle *os.File prepOnce sync.Once prepErr error resolveFlags uint64 ) func prepareOpenat2() error { prepOnce.Do(func() { fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} if err != unix.ENOSYS { logrus.Warnf("falling back to securejoin: %s", prepErr) } else { logrus.Debug("openat2 not available, falling back to securejoin") } return } file := os.NewFile(uintptr(fd), cgroupfsDir) var st unix.Statfs_t if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} logrus.Warnf("falling back to securejoin: %s", prepErr) return } cgroupRootHandle = file resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS if st.Type == unix.CGROUP2_SUPER_MAGIC { // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS } }) return prepErr } func openFile(dir, file string, flags int) (*os.File, error) { mode := os.FileMode(0) if TestMode && flags&os.O_WRONLY != 0 { // "emulate" cgroup fs for unit tests flags |= os.O_TRUNC | os.O_CREATE mode = 0o600 } // NOTE it is important to use filepath.Clean("/"+file) here // (see https://github.com/opencontainers/runc/issues/4103)! path := filepath.Join(dir, filepath.Clean("/"+file)) if prepareOpenat2() != nil { return openFallback(path, flags, mode) } relPath, ok := strings.CutPrefix(path, cgroupfsPrefix) if !ok { // Non-standard path, old system? return openFallback(path, flags, mode) } fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, &unix.OpenHow{ Resolve: resolveFlags, Flags: uint64(flags) | unix.O_CLOEXEC, Mode: uint64(mode), }) if err != nil { err = &os.PathError{Op: "openat2", Path: path, Err: err} // Check if cgroupRootHandle is still opened to cgroupfsDir // (happens when this package is incorrectly used // across the chroot/pivot_root/mntns boundary, or // when /sys/fs/cgroup is remounted). // // TODO: if such usage will ever be common, amend this // to reopen cgroupRootHandle and retry openat2. fdDest, fdErr := os.Readlink("/proc/thread-self/fd/" + strconv.Itoa(int(cgroupRootHandle.Fd()))) if fdErr == nil && fdDest != cgroupfsDir { // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) } return nil, err } return os.NewFile(uintptr(fd), path), nil } var errNotCgroupfs = errors.New("not a cgroup file") // Can be changed by unit tests. var openFallback = openAndCheck // openAndCheck is used when openat2(2) is not available. It checks the opened // file is on cgroupfs, returning an error otherwise. func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) { fd, err := os.OpenFile(path, flags, mode) if err != nil { return nil, err } if TestMode { return fd, nil } // Check this is a cgroupfs file. 
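// A sketch of the check that follows: fstatfs(2) reports the filesystem
// type magic (e.g. unix.CGROUP2_SUPER_MAGIC, 0x63677270, for cgroup v2),
// so a file reached via a symlink pointing outside cgroupfs is detected
// and rejected.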
var st unix.Statfs_t if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { _ = fd.Close() return nil, &os.PathError{Op: "statfs", Path: path, Err: err} } if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { _ = fd.Close() return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} } return fd, nil } cgroups-0.0.4/file_test.go000066400000000000000000000044221503527177300155160ustar00rootroot00000000000000package cgroups import ( "errors" "fmt" "os" "path/filepath" "strconv" "testing" "time" ) func TestWriteCgroupFileHandlesInterrupt(t *testing.T) { const ( memoryCgroupMount = "/sys/fs/cgroup/memory" memoryLimit = "memory.limit_in_bytes" ) if _, err := os.Stat(memoryCgroupMount); err != nil { // most probably cgroupv2 t.Skip(err) } cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond()) cgroupPath := filepath.Join(memoryCgroupMount, cgroupName) if err := os.MkdirAll(cgroupPath, 0o755); err != nil { t.Fatal(err) } defer os.RemoveAll(cgroupPath) if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil { // either cgroupv2, or memory controller is not available t.Skip(err) } for i := range 100000 { limit := 1024*1024 + i if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil { t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err) } } } func TestOpenat2(t *testing.T) { if !IsCgroup2UnifiedMode() { // The reason is many test cases below test opening files from // the top-level directory, where cgroup v1 has no files. t.Skip("test requires cgroup v2") } // Make sure we test openat2, not its fallback. openFallback = func(_ string, _ int, _ os.FileMode) (*os.File, error) { return nil, errors.New("fallback") } defer func() { openFallback = openAndCheck }() for _, tc := range []struct{ dir, file string }{ {"/sys/fs/cgroup", "cgroup.controllers"}, {"/sys/fs/cgroup", "/cgroup.controllers"}, {"/sys/fs/cgroup/", "cgroup.controllers"}, {"/sys/fs/cgroup/", "/cgroup.controllers"}, {"/", "/sys/fs/cgroup/cgroup.controllers"}, {"/", "sys/fs/cgroup/cgroup.controllers"}, {"/sys/fs/cgroup/cgroup.controllers", ""}, } { fd, err := OpenFile(tc.dir, tc.file, os.O_RDONLY) if err != nil { t.Errorf("case %+v: %v", tc, err) } fd.Close() } } func BenchmarkWriteFile(b *testing.B) { TestMode = true defer func() { TestMode = false }() dir := b.TempDir() tc := []string{ "one", "one\ntwo\nthree", "10:200 foo=bar boo=far\n300:1200 something=other\ndefault 45000\n", "\n\n\n\n\n\n\n\n", } b.ResetTimer() for i := 0; i < b.N; i++ { for _, val := range tc { if err := WriteFileByLine(dir, "file", val); err != nil { b.Fatal(err) } } } } cgroups-0.0.4/fs/000077500000000000000000000000001503527177300136175ustar00rootroot00000000000000cgroups-0.0.4/fs/blkio.go000066400000000000000000000173541503527177300152600ustar00rootroot00000000000000package fs import ( "bufio" "os" "path/filepath" "strconv" "strings" "github.com/opencontainers/cgroups" ) type BlkioGroup struct { weightFilename string weightDeviceFilename string } func (s *BlkioGroup) Name() string { return "blkio" } func (s *BlkioGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *BlkioGroup) Set(path string, r *cgroups.Resources) error { s.detectWeightFilenames(path) if r.BlkioWeight != 0 { if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { return err } } if r.BlkioLeafWeight != 0 { if err := cgroups.WriteFile(path, "blkio.leaf_weight", 
strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { return err } } for _, wd := range r.BlkioWeightDevice { if wd.Weight != 0 { if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { return err } } if wd.LeafWeight != 0 { if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { return err } } } for _, td := range r.BlkioThrottleReadBpsDevice { if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { return err } } for _, td := range r.BlkioThrottleWriteBpsDevice { if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { return err } } for _, td := range r.BlkioThrottleReadIOPSDevice { if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { return err } } for _, td := range r.BlkioThrottleWriteIOPSDevice { if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { return err } } return nil } /* examples: blkio.sectors 8:0 6792 blkio.io_service_bytes 8:0 Read 1282048 8:0 Write 2195456 8:0 Sync 2195456 8:0 Async 1282048 8:0 Total 3477504 Total 3477504 blkio.io_serviced 8:0 Read 124 8:0 Write 104 8:0 Sync 104 8:0 Async 124 8:0 Total 228 Total 228 blkio.io_queued 8:0 Read 0 8:0 Write 0 8:0 Sync 0 8:0 Async 0 8:0 Total 0 Total 0 */ func splitBlkioStatLine(r rune) bool { return r == ' ' || r == ':' } func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { var blkioStats []cgroups.BlkioStatEntry f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) if err != nil { if os.IsNotExist(err) { return blkioStats, nil } return nil, err } defer f.Close() sc := bufio.NewScanner(f) for sc.Scan() { // format: dev type amount fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) if len(fields) < 3 { if len(fields) == 2 && fields[0] == "Total" { // skip total line continue } else { return nil, malformedLine(dir, file, sc.Text()) } } v, err := strconv.ParseUint(fields[0], 10, 64) if err != nil { return nil, &parseError{Path: dir, File: file, Err: err} } major := v v, err = strconv.ParseUint(fields[1], 10, 64) if err != nil { return nil, &parseError{Path: dir, File: file, Err: err} } minor := v op := "" valueField := 2 if len(fields) == 4 { op = fields[2] valueField = 3 } v, err = strconv.ParseUint(fields[valueField], 10, 64) if err != nil { return nil, &parseError{Path: dir, File: file, Err: err} } blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) } if err := sc.Err(); err != nil { return nil, &parseError{Path: dir, File: file, Err: err} } return blkioStats, nil } func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { type blkioStatInfo struct { filename string blkioStatEntriesPtr *[]cgroups.BlkioStatEntry } bfqDebugStats := []blkioStatInfo{ { filename: "blkio.bfq.sectors_recursive", blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, }, { filename: "blkio.bfq.io_service_time_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, }, { filename: "blkio.bfq.io_wait_time_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, }, { filename: "blkio.bfq.io_merged_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, }, { filename: "blkio.bfq.io_queued_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, }, { filename: "blkio.bfq.time_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, }, { filename: 
"blkio.bfq.io_serviced_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, }, { filename: "blkio.bfq.io_service_bytes_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, }, } bfqStats := []blkioStatInfo{ { filename: "blkio.bfq.io_serviced_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, }, { filename: "blkio.bfq.io_service_bytes_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, }, } cfqStats := []blkioStatInfo{ { filename: "blkio.sectors_recursive", blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, }, { filename: "blkio.io_service_time_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, }, { filename: "blkio.io_wait_time_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, }, { filename: "blkio.io_merged_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, }, { filename: "blkio.io_queued_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, }, { filename: "blkio.time_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, }, { filename: "blkio.io_serviced_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, }, { filename: "blkio.io_service_bytes_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, }, } throttleRecursiveStats := []blkioStatInfo{ { filename: "blkio.throttle.io_serviced_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, }, { filename: "blkio.throttle.io_service_bytes_recursive", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, }, } baseStats := []blkioStatInfo{ { filename: "blkio.throttle.io_serviced", blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, }, { filename: "blkio.throttle.io_service_bytes", blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, }, } orderedStats := [][]blkioStatInfo{ bfqDebugStats, bfqStats, cfqStats, throttleRecursiveStats, baseStats, } var blkioStats []cgroups.BlkioStatEntry var err error for _, statGroup := range orderedStats { for i, statInfo := range statGroup { if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { // if error occurs on first file, move to next group if i == 0 { break } return err } *statInfo.blkioStatEntriesPtr = blkioStats // finish if all stats are gathered if i == len(statGroup)-1 { return nil } } } return nil } func (s *BlkioGroup) detectWeightFilenames(path string) { if s.weightFilename != "" { // Already detected. 
return } if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { s.weightFilename = "blkio.weight" s.weightDeviceFilename = "blkio.weight_device" } else { s.weightFilename = "blkio.bfq.weight" s.weightDeviceFilename = "blkio.bfq.weight_device" } } cgroups-0.0.4/fs/blkio_test.go000066400000000000000000000725721503527177300163220ustar00rootroot00000000000000package fs import ( "maps" "strconv" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( sectorsRecursiveContents = `8:0 1024` sectorsRecursiveContentsBFQ = `8:0 2048` serviceBytesRecursiveContents = `8:0 Read 100 8:0 Write 200 8:0 Sync 300 8:0 Async 500 8:0 Total 500 Total 500` serviceBytesRecursiveContentsBFQ = `8:0 Read 1100 8:0 Write 1200 8:0 Sync 1300 8:0 Async 1500 8:0 Total 1500 Total 1500` servicedRecursiveContents = `8:0 Read 10 8:0 Write 40 8:0 Sync 20 8:0 Async 30 8:0 Total 50 Total 50` servicedRecursiveContentsBFQ = `8:0 Read 11 8:0 Write 41 8:0 Sync 21 8:0 Async 31 8:0 Total 51 Total 51` queuedRecursiveContents = `8:0 Read 1 8:0 Write 4 8:0 Sync 2 8:0 Async 3 8:0 Total 5 Total 5` queuedRecursiveContentsBFQ = `8:0 Read 2 8:0 Write 3 8:0 Sync 4 8:0 Async 5 8:0 Total 6 Total 6` serviceTimeRecursiveContents = `8:0 Read 173959 8:0 Write 0 8:0 Sync 0 8:0 Async 173959 8:0 Total 17395 Total 17395` serviceTimeRecursiveContentsBFQ = `8:0 Read 173959 8:0 Write 0 8:0 Sync 0 8:0 Async 173 8:0 Total 174 Total 174` waitTimeRecursiveContents = `8:0 Read 15571 8:0 Write 0 8:0 Sync 0 8:0 Async 15571 8:0 Total 15571` waitTimeRecursiveContentsBFQ = `8:0 Read 1557 8:0 Write 0 8:0 Sync 0 8:0 Async 1557 8:0 Total 1557` mergedRecursiveContents = `8:0 Read 5 8:0 Write 10 8:0 Sync 0 8:0 Async 0 8:0 Total 15 Total 15` mergedRecursiveContentsBFQ = `8:0 Read 51 8:0 Write 101 8:0 Sync 0 8:0 Async 0 8:0 Total 151 Total 151` timeRecursiveContents = `8:0 8` timeRecursiveContentsBFQ = `8:0 16` throttleServiceBytes = `8:0 Read 11030528 8:0 Write 23 8:0 Sync 42 8:0 Async 11030528 8:0 Total 11030528 252:0 Read 11030528 252:0 Write 23 252:0 Sync 42 252:0 Async 11030528 252:0 Total 11030528 Total 22061056` throttleServiceBytesRecursive = `8:0 Read 110305281 8:0 Write 231 8:0 Sync 421 8:0 Async 110305281 8:0 Total 110305281 252:0 Read 110305281 252:0 Write 231 252:0 Sync 421 252:0 Async 110305281 252:0 Total 110305281 Total 220610561` throttleServiced = `8:0 Read 164 8:0 Write 23 8:0 Sync 42 8:0 Async 164 8:0 Total 164 252:0 Read 164 252:0 Write 23 252:0 Sync 42 252:0 Async 164 252:0 Total 164 Total 328` throttleServicedRecursive = `8:0 Read 1641 8:0 Write 231 8:0 Sync 421 8:0 Async 1641 8:0 Total 1641 252:0 Read 1641 252:0 Write 231 252:0 Sync 421 252:0 Async 1641 252:0 Total 1641 Total 3281` ) var blkioBFQDebugStatsTestFiles = map[string]string{ "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, "blkio.bfq.io_queued_recursive": queuedRecursiveContentsBFQ, "blkio.bfq.io_service_time_recursive": serviceTimeRecursiveContentsBFQ, "blkio.bfq.io_wait_time_recursive": waitTimeRecursiveContentsBFQ, "blkio.bfq.io_merged_recursive": mergedRecursiveContentsBFQ, "blkio.bfq.time_recursive": timeRecursiveContentsBFQ, "blkio.bfq.sectors_recursive": sectorsRecursiveContentsBFQ, } var blkioBFQStatsTestFiles = map[string]string{ "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, } var blkioCFQStatsTestFiles = map[string]string{ 
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, "blkio.io_serviced_recursive": servicedRecursiveContents, "blkio.io_queued_recursive": queuedRecursiveContents, "blkio.io_service_time_recursive": serviceTimeRecursiveContents, "blkio.io_wait_time_recursive": waitTimeRecursiveContents, "blkio.io_merged_recursive": mergedRecursiveContents, "blkio.time_recursive": timeRecursiveContents, "blkio.sectors_recursive": sectorsRecursiveContents, } type blkioStatFailureTestCase struct { desc string filename string } func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { //nolint:unparam *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) } func TestBlkioSetWeight(t *testing.T) { const ( weightBefore = 100 weightAfter = 200 ) for _, legacyIOScheduler := range []bool{false, true} { // Populate cgroup path := tempDir(t, "blkio") weightFilename := "blkio.bfq.weight" if legacyIOScheduler { weightFilename = "blkio.weight" } writeFileContents(t, path, map[string]string{ weightFilename: strconv.Itoa(weightBefore), }) // Apply new configuration r := &cgroups.Resources{ BlkioWeight: weightAfter, } blkio := &BlkioGroup{} if err := blkio.Set(path, r); err != nil { t.Fatal(err) } // Verify results if weightFilename != blkio.weightFilename { t.Fatalf("weight filename detection failed: expected %q, detected %q", weightFilename, blkio.weightFilename) } value, err := fscommon.GetCgroupParamUint(path, weightFilename) if err != nil { t.Fatal(err) } if value != weightAfter { t.Fatalf("Got the wrong value, set %s failed.", weightFilename) } } } func TestBlkioSetWeightDevice(t *testing.T) { const ( weightDeviceBefore = "8:0 400" ) for _, legacyIOScheduler := range []bool{false, true} { // Populate cgroup path := tempDir(t, "blkio") weightFilename := "blkio.bfq.weight" weightDeviceFilename := "blkio.bfq.weight_device" if legacyIOScheduler { weightFilename = "blkio.weight" weightDeviceFilename = "blkio.weight_device" } writeFileContents(t, path, map[string]string{ weightFilename: "", weightDeviceFilename: weightDeviceBefore, }) // Apply new configuration wd := cgroups.NewWeightDevice(8, 0, 500, 0) weightDeviceAfter := wd.WeightString() r := &cgroups.Resources{ BlkioWeightDevice: []*cgroups.WeightDevice{wd}, } blkio := &BlkioGroup{} if err := blkio.Set(path, r); err != nil { t.Fatal(err) } // Verify results if weightDeviceFilename != blkio.weightDeviceFilename { t.Fatalf("weight_device filename detection failed: expected %q, detected %q", weightDeviceFilename, blkio.weightDeviceFilename) } value, err := fscommon.GetCgroupParamString(path, weightDeviceFilename) if err != nil { t.Fatal(err) } if value != weightDeviceAfter { t.Fatalf("Got the wrong value, set %s failed.", weightDeviceFilename) } } } // regression #274 func TestBlkioSetMultipleWeightDevice(t *testing.T) { path := tempDir(t, "blkio") const ( weightDeviceBefore = "8:0 400" ) wd1 := cgroups.NewWeightDevice(8, 0, 500, 0) wd2 := cgroups.NewWeightDevice(8, 16, 500, 0) // we cannot actually set and check both because normal os.WriteFile // when writing to cgroup file will overwrite the whole file content instead // of updating it as the kernel is doing. Just check the second device // is present will suffice for the test to ensure multiple writes are done. 
weightDeviceAfter := wd2.WeightString() blkio := &BlkioGroup{} blkio.detectWeightFilenames(path) if blkio.weightDeviceFilename != "blkio.bfq.weight_device" { t.Fatalf("when blkio controller is unavailable, expected to use \"blkio.bfq.weight_device\", tried to use %q", blkio.weightDeviceFilename) } writeFileContents(t, path, map[string]string{ blkio.weightDeviceFilename: weightDeviceBefore, }) r := &cgroups.Resources{ BlkioWeightDevice: []*cgroups.WeightDevice{wd1, wd2}, } if err := blkio.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, blkio.weightDeviceFilename) if err != nil { t.Fatal(err) } if value != weightDeviceAfter { t.Fatalf("Got the wrong value, set %s failed.", blkio.weightDeviceFilename) } } func TestBlkioBFQDebugStats(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, blkioBFQDebugStatsTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.BlkioStats{} appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } 
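// exampleStatEntry is an illustrative helper (not part of the original test
// suite, and unused by it): it shows how a single "major:minor Op value"
// line from a blkio stat file, e.g. "8:0 Read 100", maps onto the
// cgroups.BlkioStatEntry values that the expectations in these tests are
// built from via appendBlkioStatEntry.
func exampleStatEntry() cgroups.BlkioStatEntry { //nolint:unused
	return cgroups.BlkioStatEntry{Major: 8, Minor: 0, Op: "Read", Value: 100}
}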
func TestBlkioMultipleStatsFiles(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, blkioBFQDebugStatsTestFiles) writeFileContents(t, path, blkioCFQStatsTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.BlkioStats{} appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } func TestBlkioBFQStats(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, blkioBFQStatsTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.BlkioStats{} appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") 
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } func TestBlkioStatsNoFilesBFQDebug(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } testCases := []blkioStatFailureTestCase{ { desc: "missing blkio.bfq.io_service_bytes_recursive file", filename: "blkio.bfq.io_service_bytes_recursive", }, { desc: "missing blkio.bfq.io_serviced_recursive file", filename: "blkio.bfq.io_serviced_recursive", }, { desc: "missing blkio.bfq.io_queued_recursive file", filename: "blkio.bfq.io_queued_recursive", }, { desc: "missing blkio.bfq.sectors_recursive file", filename: "blkio.bfq.sectors_recursive", }, { desc: "missing blkio.bfq.io_service_time_recursive file", filename: "blkio.bfq.io_service_time_recursive", }, { desc: "missing blkio.bfq.io_wait_time_recursive file", filename: "blkio.bfq.io_wait_time_recursive", }, { desc: "missing blkio.bfq.io_merged_recursive file", filename: "blkio.bfq.io_merged_recursive", }, { desc: "missing blkio.bfq.time_recursive file", filename: "blkio.bfq.time_recursive", }, } for _, testCase := range testCases { path := tempDir(t, "blkio") tempBlkioTestFiles := maps.Clone(blkioBFQDebugStatsTestFiles) delete(tempBlkioTestFiles, testCase.filename) writeFileContents(t, path, tempBlkioTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Errorf("%s: want no error, got: %+v", testCase.desc, err) } } } func TestBlkioCFQStats(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, blkioCFQStatsTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } // Verify expected stats.
expectedStats := cgroups.BlkioStats{} appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async") appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async") appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async") appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total") appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "") expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } func TestBlkioStatsNoFilesCFQ(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } testCases := []blkioStatFailureTestCase{ { desc: "missing blkio.io_service_bytes_recursive file", filename: "blkio.io_service_bytes_recursive", }, { desc: "missing blkio.io_serviced_recursive file", filename: "blkio.io_serviced_recursive", }, { desc: "missing blkio.io_queued_recursive file", filename: "blkio.io_queued_recursive", }, { desc: "missing blkio.sectors_recursive file", filename: "blkio.sectors_recursive", }, { desc: "missing blkio.io_service_time_recursive file", filename: "blkio.io_service_time_recursive", }, { desc: "missing blkio.io_wait_time_recursive file", filename: "blkio.io_wait_time_recursive", }, { desc: "missing blkio.io_merged_recursive file", filename: "blkio.io_merged_recursive", }, { desc: "missing blkio.time_recursive file", filename: "blkio.time_recursive", }, } for _, testCase := range testCases { path := tempDir(t, "blkio") tempBlkioTestFiles :=
maps.Clone(blkioCFQStatsTestFiles) delete(tempBlkioTestFiles, testCase.filename) writeFileContents(t, path, tempBlkioTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Errorf("%s: want no error, got %+v", testCase.desc, err) } } } func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "8:0 Read 100 100", "blkio.io_serviced_recursive": servicedRecursiveContents, "blkio.io_queued_recursive": queuedRecursiveContents, "blkio.sectors_recursive": sectorsRecursiveContents, "blkio.io_service_time_recursive": serviceTimeRecursiveContents, "blkio.io_wait_time_recursive": waitTimeRecursiveContents, "blkio.io_merged_recursive": mergedRecursiveContents, "blkio.time_recursive": timeRecursiveContents, }) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected to fail, but did not") } } func TestBlkioStatsUnexpectedFieldType(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "8:0 Read Write", "blkio.io_serviced_recursive": servicedRecursiveContents, "blkio.io_queued_recursive": queuedRecursiveContents, "blkio.sectors_recursive": sectorsRecursiveContents, "blkio.io_service_time_recursive": serviceTimeRecursiveContents, "blkio.io_wait_time_recursive": waitTimeRecursiveContents, "blkio.io_merged_recursive": mergedRecursiveContents, "blkio.time_recursive": timeRecursiveContents, }) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected to fail, but did not") } } func TestThrottleRecursiveBlkioStats(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "", "blkio.io_serviced_recursive": "", "blkio.io_queued_recursive": "", "blkio.sectors_recursive": "", "blkio.io_service_time_recursive": "", "blkio.io_wait_time_recursive": "", "blkio.io_merged_recursive": "", "blkio.time_recursive": "", "blkio.throttle.io_service_bytes_recursive": throttleServiceBytesRecursive, "blkio.throttle.io_serviced_recursive": throttleServicedRecursive, }) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } // Verify expected stats.
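// With every primary stat file empty above, GetStats falls through the
// ordered groups (bfq-debug, bfq, cfq) to the blkio.throttle.*_recursive
// files, so the throttle values are what must surface here.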
expectedStats := cgroups.BlkioStats{} appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 231, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 421, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Total") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 231, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 421, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 231, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 421, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 231, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 421, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Total") expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } func TestThrottleBlkioStats(t *testing.T) { path := tempDir(t, "blkio") writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "", "blkio.io_serviced_recursive": "", "blkio.io_queued_recursive": "", "blkio.sectors_recursive": "", "blkio.io_service_time_recursive": "", "blkio.io_wait_time_recursive": "", "blkio.io_merged_recursive": "", "blkio.time_recursive": "", "blkio.throttle.io_service_bytes": throttleServiceBytes, "blkio.throttle.io_serviced": throttleServiced, }) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } // Verify expected stats. 
expectedStats := cgroups.BlkioStats{} appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async") appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async") appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total") expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` ) td := cgroups.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() writeFileContents(t, path, map[string]string{ "blkio.throttle.read_bps_device": throttleBefore, }) r := &cgroups.Resources{ BlkioThrottleReadBpsDevice: []*cgroups.ThrottleDevice{td}, } blkio := &BlkioGroup{} if err := blkio.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_bps_device") if err != nil { t.Fatal(err) } if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.") } } func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` ) td := cgroups.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() writeFileContents(t, path, map[string]string{ "blkio.throttle.write_bps_device": throttleBefore, }) r := &cgroups.Resources{ BlkioThrottleWriteBpsDevice: []*cgroups.ThrottleDevice{td}, } blkio := &BlkioGroup{} if err := blkio.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.write_bps_device") if err != nil { t.Fatal(err) } if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.") } } func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` ) td := cgroups.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() writeFileContents(t, path, map[string]string{ "blkio.throttle.read_iops_device": throttleBefore, }) r := &cgroups.Resources{ BlkioThrottleReadIOPSDevice: []*cgroups.ThrottleDevice{td}, } blkio := &BlkioGroup{} if err := blkio.Set(path, r); err != nil { t.Fatal(err) } 
value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_iops_device") if err != nil { t.Fatal(err) } if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.") } } func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` ) td := cgroups.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() writeFileContents(t, path, map[string]string{ "blkio.throttle.write_iops_device": throttleBefore, }) r := &cgroups.Resources{ BlkioThrottleWriteIOPSDevice: []*cgroups.ThrottleDevice{td}, } blkio := &BlkioGroup{} if err := blkio.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.write_iops_device") if err != nil { t.Fatal(err) } if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.") } } cgroups-0.0.4/fs/cpu.go000066400000000000000000000114461503527177300147430ustar00rootroot00000000000000package fs import ( "bufio" "errors" "fmt" "os" "strconv" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" "golang.org/x/sys/unix" ) type CpuGroup struct{} func (s *CpuGroup) Name() string { return "cpu" } func (s *CpuGroup) Apply(path string, r *cgroups.Resources, pid int) error { if err := os.MkdirAll(path, 0o755); err != nil { return err } // We should set the real-time group scheduling settings before moving // the process in, because if the process is already in SCHED_RR mode // and no RT bandwidth is set, adding it will fail. if err := s.SetRtSched(path, r); err != nil { return err } // Since we are not using apply(), we need to place the pid // into the procs file. return cgroups.WriteCgroupProc(path, pid) } func (s *CpuGroup) SetRtSched(path string, r *cgroups.Resources) error { var period string if r.CpuRtPeriod != 0 { period = strconv.FormatUint(r.CpuRtPeriod, 10) if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil { // The values of cpu.rt_period_us and cpu.rt_runtime_us // are inter-dependent and need to be set in a proper order. // If the kernel rejects the new period value with EINVAL // and the new runtime value is also being set, let's // ignore the error for now and retry later. if !errors.Is(err, unix.EINVAL) || r.CpuRtRuntime == 0 { return err } } else { period = "" } } if r.CpuRtRuntime != 0 { if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { return err } if period != "" { if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil { return err } } } return nil } func (s *CpuGroup) Set(path string, r *cgroups.Resources) error { if r.CpuShares != 0 { shares := r.CpuShares if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { return err } // read it back sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") if err != nil { return err } // ... and check if shares > sharesRead { return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) } else if shares < sharesRead { return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) } } var period string if r.CpuPeriod != 0 { period = strconv.FormatUint(r.CpuPeriod, 10) if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { // Sometimes when the period to be set is smaller // than the current one, it is rejected by the kernel // (EINVAL) as old_quota/new_period exceeds the parent // cgroup quota limit.
If this happens and the quota is // going to be set, ignore the error for now and retry // after setting the quota. if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { return err } } else { period = "" } } var burst string if r.CpuBurst != nil { burst = strconv.FormatUint(*r.CpuBurst, 10) if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { if errors.Is(err, unix.ENOENT) { // If CPU burst knob is not available (e.g. // older kernel), ignore it. burst = "" } else { // Sometimes when the burst to be set is larger // than the current one, it is rejected by the kernel // (EINVAL) as old_quota/new_burst exceeds the parent // cgroup quota limit. If this happens and the quota is // going to be set, ignore the error for now and retry // after setting the quota. if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { return err } } } else { burst = "" } } if r.CpuQuota != 0 { if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { return err } if period != "" { if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { return err } } if burst != "" { if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { return err } } } if r.CPUIdle != nil { idle := strconv.FormatInt(*r.CPUIdle, 10) if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil { return err } } return s.SetRtSched(path, r) } func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { const file = "cpu.stat" f, err := cgroups.OpenFile(path, file, os.O_RDONLY) if err != nil { if os.IsNotExist(err) { return nil } return err } defer f.Close() sc := bufio.NewScanner(f) for sc.Scan() { t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return &parseError{Path: path, File: file, Err: err} } switch t { case "nr_periods": stats.CpuStats.ThrottlingData.Periods = v case "nr_throttled": stats.CpuStats.ThrottlingData.ThrottledPeriods = v case "throttled_time": stats.CpuStats.ThrottlingData.ThrottledTime = v } } return nil } cgroups-0.0.4/fs/cpu_test.go000066400000000000000000000117321503527177300160000ustar00rootroot00000000000000package fs import ( "fmt" "strconv" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) func TestCpuSetShares(t *testing.T) { path := tempDir(t, "cpu") const ( sharesBefore = 1024 sharesAfter = 512 ) writeFileContents(t, path, map[string]string{ "cpu.shares": strconv.Itoa(sharesBefore), }) r := &cgroups.Resources{ CpuShares: sharesAfter, } cpu := &CpuGroup{} if err := cpu.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "cpu.shares") if err != nil { t.Fatal(err) } if value != sharesAfter { t.Fatal("Got the wrong value, set cpu.shares failed.") } } func TestCpuSetBandWidth(t *testing.T) { path := tempDir(t, "cpu") const ( quotaBefore = 8000 quotaAfter = 5000 burstBefore = 2000 periodBefore = 10000 periodAfter = 7000 rtRuntimeBefore = 8000 rtRuntimeAfter = 5000 rtPeriodBefore = 10000 rtPeriodAfter = 7000 ) burstAfter := uint64(1000) writeFileContents(t, path, map[string]string{ "cpu.cfs_quota_us": strconv.Itoa(quotaBefore), "cpu.cfs_burst_us": strconv.Itoa(burstBefore), "cpu.cfs_period_us": strconv.Itoa(periodBefore), "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), }) r := &cgroups.Resources{ CpuQuota: quotaAfter, CpuBurst: &burstAfter, CpuPeriod: periodAfter, CpuRtRuntime: rtRuntimeAfter, CpuRtPeriod: rtPeriodAfter, } cpu := &CpuGroup{} if err := 
cpu.Set(path, r); err != nil { t.Fatal(err) } quota, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_quota_us") if err != nil { t.Fatal(err) } if quota != quotaAfter { t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.") } burst, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_burst_us") if err != nil { t.Fatal(err) } if burst != burstAfter { t.Fatal("Got the wrong value, set cpu.cfs_burst_us failed.") } period, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_period_us") if err != nil { t.Fatal(err) } if period != periodAfter { t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") } rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") if err != nil { t.Fatal(err) } if rtRuntime != rtRuntimeAfter { t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") } rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us") if err != nil { t.Fatal(err) } if rtPeriod != rtPeriodAfter { t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") } } func TestCpuStats(t *testing.T) { path := tempDir(t, "cpu") const ( nrPeriods = 2000 nrThrottled = 200 throttledTime = uint64(18446744073709551615) ) cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n", nrPeriods, nrThrottled, throttledTime) writeFileContents(t, path, map[string]string{ "cpu.stat": cpuStatContent, }) cpu := &CpuGroup{} actualStats := *cgroups.NewStats() err := cpu.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.ThrottlingData{ Periods: nrPeriods, ThrottledPeriods: nrThrottled, ThrottledTime: throttledTime, } expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) } func TestNoCpuStatFile(t *testing.T) { path := tempDir(t, "cpu") cpu := &CpuGroup{} actualStats := *cgroups.NewStats() err := cpu.GetStats(path, &actualStats) if err != nil { t.Fatal("Expected not to fail, but did") } } func TestInvalidCpuStat(t *testing.T) { path := tempDir(t, "cpu") cpuStatContent := `nr_periods 2000 nr_throttled 200 throttled_time fortytwo` writeFileContents(t, path, map[string]string{ "cpu.stat": cpuStatContent, }) cpu := &CpuGroup{} actualStats := *cgroups.NewStats() err := cpu.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failed stat parsing.") } } func TestCpuSetRtSchedAtApply(t *testing.T) { path := tempDir(t, "cpu") const ( rtRuntimeBefore = 0 rtRuntimeAfter = 5000 rtPeriodBefore = 0 rtPeriodAfter = 7000 ) writeFileContents(t, path, map[string]string{ "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), }) r := &cgroups.Resources{ CpuRtRuntime: rtRuntimeAfter, CpuRtPeriod: rtPeriodAfter, } cpu := &CpuGroup{} if err := cpu.Apply(path, r, 1234); err != nil { t.Fatal(err) } rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") if err != nil { t.Fatal(err) } if rtRuntime != rtRuntimeAfter { t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") } rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us") if err != nil { t.Fatal(err) } if rtPeriod != rtPeriodAfter { t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") } pid, err := fscommon.GetCgroupParamUint(path, "cgroup.procs") if err != nil { t.Fatal(err) } if pid != 1234 { t.Fatal("Got the wrong value, set cgroup.procs failed.") } } cgroups-0.0.4/fs/cpuacct.go000066400000000000000000000100631503527177300155700ustar00rootroot00000000000000package fs import ( "bufio" "os" "strconv" "strings" "github.com/opencontainers/cgroups" 
"github.com/opencontainers/cgroups/fscommon" ) const ( nsInSec = 1000000000 // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and // on Linux it's a constant which is safe to be hard coded, // so we can avoid using cgo here. For details, see: // https://github.com/containerd/cgroups/pull/12 clockTicks uint64 = 100 ) type CpuacctGroup struct{} func (s *CpuacctGroup) Name() string { return "cpuacct" } func (s *CpuacctGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *CpuacctGroup) Set(_ string, _ *cgroups.Resources) error { return nil } func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { if !cgroups.PathExists(path) { return nil } userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) if err != nil { return err } totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") if err != nil { return err } percpuUsage, err := getPercpuUsage(path) if err != nil { return err } percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) if err != nil { return err } stats.CpuStats.CpuUsage.TotalUsage = totalUsage stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage return nil } // Returns user and kernel usage breakdown in nanoseconds. func getCpuUsageBreakdown(path string) (uint64, uint64, error) { var userModeUsage, kernelModeUsage uint64 const ( userField = "user" systemField = "system" file = "cpuacct.stat" ) // Expected format: // user // system data, err := cgroups.ReadFile(path, file) if err != nil { return 0, 0, err } fields := strings.Fields(data) if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { return 0, 0, malformedLine(path, file, data) } if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { return 0, 0, &parseError{Path: path, File: file, Err: err} } if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { return 0, 0, &parseError{Path: path, File: file, Err: err} } return (userModeUsage * nsInSec) / clockTicks, (kernelModeUsage * nsInSec) / clockTicks, nil } func getPercpuUsage(path string) ([]uint64, error) { const file = "cpuacct.usage_percpu" percpuUsage := []uint64{} data, err := cgroups.ReadFile(path, file) if err != nil { return percpuUsage, err } for _, value := range strings.Fields(data) { value, err := strconv.ParseUint(value, 10, 64) if err != nil { return percpuUsage, &parseError{Path: path, File: file, Err: err} } percpuUsage = append(percpuUsage, value) } return percpuUsage, nil } func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { usageKernelMode := []uint64{} usageUserMode := []uint64{} const file = "cpuacct.usage_all" fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) if os.IsNotExist(err) { return usageKernelMode, usageUserMode, nil } else if err != nil { return nil, nil, err } defer fd.Close() scanner := bufio.NewScanner(fd) scanner.Scan() // skipping header line for scanner.Scan() { // Each line is: cpu user system fields := strings.SplitN(scanner.Text(), " ", 3) if len(fields) != 3 { continue } user, err := strconv.ParseUint(fields[1], 10, 64) if err != nil { return nil, nil, &parseError{Path: path, File: file, Err: err} } usageUserMode = append(usageUserMode, user) kernel, err := strconv.ParseUint(fields[2], 10, 
64) if err != nil { return nil, nil, &parseError{Path: path, File: file, Err: err} } usageKernelMode = append(usageKernelMode, kernel) } if err := scanner.Err(); err != nil { return nil, nil, &parseError{Path: path, File: file, Err: err} } return usageKernelMode, usageUserMode, nil } cgroups-0.0.4/fs/cpuacct_test.go000066400000000000000000000066501503527177300166360ustar00rootroot00000000000000package fs import ( "reflect" "testing" "github.com/opencontainers/cgroups" ) const ( cpuAcctUsageContents = "12262454190222160" cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086" cpuAcctStatContents = "user 452278264\nsystem 291429664" cpuAcctUsageAll = `cpu user system 0 962250696038415 637727786389114 1 981956408513304 638197595421064 2 1002658817529022 638956774598358 3 994937703492523 637985531181620 4 874843781648690 638837766495476 5 872544369885276 638763309884944 6 870104915696359 640081778921247 7 870202363887496 638716766259495 ` ) func TestCpuacctStats(t *testing.T) { path := tempDir(t, "cpuacct") writeFileContents(t, path, map[string]string{ "cpuacct.usage": cpuAcctUsageContents, "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, "cpuacct.stat": cpuAcctStatContents, "cpuacct.usage_all": cpuAcctUsageAll, }) cpuacct := &CpuacctGroup{} actualStats := *cgroups.NewStats() err := cpuacct.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.CpuUsage{ TotalUsage: uint64(12262454190222160), PercpuUsage: []uint64{ 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, }, PercpuUsageInKernelmode: []uint64{ 637727786389114, 638197595421064, 638956774598358, 637985531181620, 638837766495476, 638763309884944, 640081778921247, 638716766259495, }, PercpuUsageInUsermode: []uint64{ 962250696038415, 981956408513304, 1002658817529022, 994937703492523, 874843781648690, 872544369885276, 870104915696359, 870202363887496, }, UsageInKernelmode: (uint64(291429664) * nsInSec) / clockTicks, UsageInUsermode: (uint64(452278264) * nsInSec) / clockTicks, } if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { t.Errorf("Expected CPU usage %#v but found %#v\n", expectedStats, actualStats.CpuStats.CpuUsage) } } func TestCpuacctStatsWithoutUsageAll(t *testing.T) { path := tempDir(t, "cpuacct") writeFileContents(t, path, map[string]string{ "cpuacct.usage": cpuAcctUsageContents, "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, "cpuacct.stat": cpuAcctStatContents, }) cpuacct := &CpuacctGroup{} actualStats := *cgroups.NewStats() err := cpuacct.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.CpuUsage{ TotalUsage: uint64(12262454190222160), PercpuUsage: []uint64{ 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, }, PercpuUsageInKernelmode: []uint64{}, PercpuUsageInUsermode: []uint64{}, UsageInKernelmode: (uint64(291429664) * nsInSec) / clockTicks, UsageInUsermode: (uint64(452278264) * nsInSec) / clockTicks, } if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { t.Errorf("Expected CPU usage %#v but found %#v\n", expectedStats, actualStats.CpuStats.CpuUsage) } } func BenchmarkGetCpuUsageBreakdown(b *testing.B) { path := tempDir(b, "cpuacct") writeFileContents(b, path, map[string]string{ "cpuacct.stat": cpuAcctStatContents, }) 
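// Sanity check of the conversion being benchmarked, using cpuAcctStatContents above: "user 452278264" is in USER_HZ ticks, so getCpuUsageBreakdown reports 452278264 * nsInSec / clockTicks = 452278264 * 1000000000 / 100 nanoseconds, matching UsageInUsermode in TestCpuacctStats.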
b.ResetTimer() for i := 0; i < b.N; i++ { _, _, err := getCpuUsageBreakdown(path) if err != nil { b.Fatal(err) } } } cgroups-0.0.4/fs/cpuset.go000066400000000000000000000177131503527177300154620ustar00rootroot00000000000000package fs import ( "errors" "os" "path/filepath" "strconv" "strings" "sync" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) var ( cpusetLock sync.Mutex cpusetPrefix = "cpuset." cpusetFastPath bool ) func cpusetFile(path string, name string) string { cpusetLock.Lock() defer cpusetLock.Unlock() // Only the v1 cpuset cgroup is allowed to mount with noprefix. // See kernel source: https://github.com/torvalds/linux/blob/2e1b3cc9d7f790145a80cb705b168f05dab65df2/kernel/cgroup/cgroup-v1.c#L1070 // Cpuset cannot be mounted with and without prefix simultaneously. // Commonly used in Android environments. if cpusetFastPath { return cpusetPrefix + name } err := unix.Access(filepath.Join(path, cpusetPrefix+name), unix.F_OK) if err == nil { // Use the fast path only if we can access one type of mount for cpuset already cpusetFastPath = true } else { err = unix.Access(filepath.Join(path, name), unix.F_OK) if err == nil { cpusetPrefix = "" cpusetFastPath = true } } return cpusetPrefix + name } type CpusetGroup struct{} func (s *CpusetGroup) Name() string { return "cpuset" } func (s *CpusetGroup) Apply(path string, r *cgroups.Resources, pid int) error { return s.ApplyDir(path, r, pid) } func (s *CpusetGroup) Set(path string, r *cgroups.Resources) error { if r.CpusetCpus != "" { if err := cgroups.WriteFile(path, cpusetFile(path, "cpus"), r.CpusetCpus); err != nil { return err } } if r.CpusetMems != "" { if err := cgroups.WriteFile(path, cpusetFile(path, "mems"), r.CpusetMems); err != nil { return err } } return nil } func getCpusetStat(path string, file string) ([]uint16, error) { var extracted []uint16 fileContent, err := fscommon.GetCgroupParamString(path, file) if err != nil { return extracted, err } if len(fileContent) == 0 { return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} } for _, s := range strings.Split(fileContent, ",") { fromStr, toStr, ok := strings.Cut(s, "-") if ok { from, err := strconv.ParseUint(fromStr, 10, 16) if err != nil { return extracted, &parseError{Path: path, File: file, Err: err} } to, err := strconv.ParseUint(toStr, 10, 16) if err != nil { return extracted, &parseError{Path: path, File: file, Err: err} } if from > to { return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, from > to")} } for i := from; i <= to; i++ { extracted = append(extracted, uint16(i)) } } else { value, err := strconv.ParseUint(s, 10, 16) if err != nil { return extracted, &parseError{Path: path, File: file, Err: err} } extracted = append(extracted, uint16(value)) } } return extracted, nil } func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { var err error stats.CPUSetStats.CPUs, err = getCpusetStat(path, cpusetFile(path, "cpus")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "cpu_exclusive")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.Mems, err = getCpusetStat(path, cpusetFile(path, "mems")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_hardwall")) if err != nil && !errors.Is(err, 
os.ErrNotExist) { return err } stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_exclusive")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_migrate")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_page")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_slab")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_pressure")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "sched_load_balance")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, cpusetFile(path, "sched_relax_domain_level")) if err != nil && !errors.Is(err, os.ErrNotExist) { return err } return nil } func (s *CpusetGroup) ApplyDir(dir string, r *cgroups.Resources, pid int) error { // This might happen if we have no cpuset cgroup mounted. // Just do nothing and don't fail. if dir == "" { return nil } // 'ensureParent' start with parent because we don't want to // explicitly inherit from parent, it could conflict with // 'cpuset.cpu_exclusive'. if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { return err } if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { return err } // We didn't inherit cpuset configs from parent, but we have // to ensure cpuset configs are set before moving task into the // cgroup. // The logic is, if user specified cpuset configs, use these // specified configs, otherwise, inherit from parent. This makes // cpuset configs work correctly with 'cpuset.cpu_exclusive', and // keep backward compatibility. if err := s.ensureCpusAndMems(dir, r); err != nil { return err } // Since we are not using apply(), we need to place the pid // into the procs file. return cgroups.WriteCgroupProc(dir, pid) } func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { if cpus, err = cgroups.ReadFile(parent, cpusetFile(parent, "cpus")); err != nil { return } if mems, err = cgroups.ReadFile(parent, cpusetFile(parent, "mems")); err != nil { return } return cpus, mems, nil } // cpusetEnsureParent makes sure that the parent directories of current // are created and populated with the proper cpus and mems files copied // from their respective parent. It does that recursively, starting from // the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). func cpusetEnsureParent(current string) error { var st unix.Statfs_t parent := filepath.Dir(current) err := unix.Statfs(parent, &st) if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { return nil } // Treat non-existing directory as cgroupfs as it will be created, // and the root cpuset directory obviously exists. 
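// A hypothetical walk-through: for current = /sys/fs/cgroup/cpuset/a, the recursion ascends until Statfs reports a non-cgroupfs parent, so the cpuset mount point itself is left untouched; on the way back, each level below it is created with Mkdir and, when its own cpus/mems are empty, populated from its parent via cpusetCopyIfNeeded.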
if err != nil && err != unix.ENOENT { return &os.PathError{Op: "statfs", Path: parent, Err: err} } if err := cpusetEnsureParent(parent); err != nil { return err } if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { return err } return cpusetCopyIfNeeded(current, parent) } // cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent // directory to the current directory if the file's contents are 0 func cpusetCopyIfNeeded(current, parent string) error { currentCpus, currentMems, err := getCpusetSubsystemSettings(current) if err != nil { return err } parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) if err != nil { return err } if isEmptyCpuset(currentCpus) { if err := cgroups.WriteFile(current, cpusetFile(current, "cpus"), parentCpus); err != nil { return err } } if isEmptyCpuset(currentMems) { if err := cgroups.WriteFile(current, cpusetFile(current, "mems"), parentMems); err != nil { return err } } return nil } func isEmptyCpuset(str string) bool { return str == "" || str == "\n" } func (s *CpusetGroup) ensureCpusAndMems(path string, r *cgroups.Resources) error { if err := s.Set(path, r); err != nil { return err } return cpusetCopyIfNeeded(path, filepath.Dir(path)) } cgroups-0.0.4/fs/cpuset_test.go000066400000000000000000000131361503527177300165140ustar00rootroot00000000000000package fs import ( "maps" "reflect" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( cpus = "0-2,7,12-14\n" cpuExclusive = "1\n" mems = "1-4,6,9\n" memHardwall = "0\n" memExclusive = "0\n" memoryMigrate = "1\n" memorySpreadPage = "0\n" memorySpeadSlab = "1\n" memoryPressure = "34377\n" schedLoadBalance = "1\n" schedRelaxDomainLevel = "-1\n" ) var cpusetTestFiles = map[string]string{ "cpuset.cpus": cpus, "cpuset.cpu_exclusive": cpuExclusive, "cpuset.mems": mems, "cpuset.mem_hardwall": memHardwall, "cpuset.mem_exclusive": memExclusive, "cpuset.memory_migrate": memoryMigrate, "cpuset.memory_spread_page": memorySpreadPage, "cpuset.memory_spread_slab": memorySpeadSlab, "cpuset.memory_pressure": memoryPressure, "cpuset.sched_load_balance": schedLoadBalance, "cpuset.sched_relax_domain_level": schedRelaxDomainLevel, } func TestCPUSetSetCpus(t *testing.T) { path := tempDir(t, "cpuset") const ( cpusBefore = "0" cpusAfter = "1-3" ) writeFileContents(t, path, map[string]string{ "cpuset.cpus": cpusBefore, }) r := &cgroups.Resources{ CpusetCpus: cpusAfter, } cpuset := &CpusetGroup{} if err := cpuset.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "cpuset.cpus") if err != nil { t.Fatal(err) } if value != cpusAfter { t.Fatal("Got the wrong value, set cpuset.cpus failed.") } } func TestCPUSetSetMems(t *testing.T) { path := tempDir(t, "cpuset") const ( memsBefore = "0" memsAfter = "1" ) writeFileContents(t, path, map[string]string{ "cpuset.mems": memsBefore, }) r := &cgroups.Resources{ CpusetMems: memsAfter, } cpuset := &CpusetGroup{} if err := cpuset.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "cpuset.mems") if err != nil { t.Fatal(err) } if value != memsAfter { t.Fatal("Got the wrong value, set cpuset.mems failed.") } } func TestCPUSetStatsCorrect(t *testing.T) { path := tempDir(t, "cpuset") writeFileContents(t, path, cpusetTestFiles) cpuset := &CpusetGroup{} actualStats := *cgroups.NewStats() err := cpuset.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.CPUSetStats{ CPUs: []uint16{0, 1, 2, 7, 12, 13, 14}, 
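// CPUs above mirrors what getCpusetStat derives from the cpus constant ("0-2,7,12-14"): every from-to range expands inclusively and bare values pass through, giving 0,1,2 plus 7 plus 12,13,14.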
CPUExclusive: 1, Mems: []uint16{1, 2, 3, 4, 6, 9}, MemoryMigrate: 1, MemHardwall: 0, MemExclusive: 0, MemorySpreadPage: 0, MemorySpreadSlab: 1, MemoryPressure: 34377, SchedLoadBalance: 1, SchedRelaxDomainLevel: -1, } if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) { t.Fatalf("Expected Cpuset stats usage %#v but found %#v", expectedStats, actualStats.CPUSetStats) } } func TestCPUSetStatsMissingFiles(t *testing.T) { for _, testCase := range []struct { desc string filename, contents string removeFile bool }{ { desc: "empty cpus file", filename: "cpuset.cpus", contents: "", removeFile: false, }, { desc: "empty mems file", filename: "cpuset.mems", contents: "", removeFile: false, }, { desc: "corrupted cpus file", filename: "cpuset.cpus", contents: "0-3,*4^2", removeFile: false, }, { desc: "corrupted mems file", filename: "cpuset.mems", contents: "0,1,2-5,8-7", removeFile: false, }, { desc: "missing cpu_exclusive file", filename: "cpuset.cpu_exclusive", contents: "", removeFile: true, }, { desc: "missing memory_migrate file", filename: "cpuset.memory_migrate", contents: "", removeFile: true, }, { desc: "missing mem_hardwall file", filename: "cpuset.mem_hardwall", contents: "", removeFile: true, }, { desc: "missing mem_exclusive file", filename: "cpuset.mem_exclusive", contents: "", removeFile: true, }, { desc: "missing memory_spread_page file", filename: "cpuset.memory_spread_page", contents: "", removeFile: true, }, { desc: "missing memory_spread_slab file", filename: "cpuset.memory_spread_slab", contents: "", removeFile: true, }, { desc: "missing memory_pressure file", filename: "cpuset.memory_pressure", contents: "", removeFile: true, }, { desc: "missing sched_load_balance file", filename: "cpuset.sched_load_balance", contents: "", removeFile: true, }, { desc: "missing sched_relax_domain_level file", filename: "cpuset.sched_relax_domain_level", contents: "", removeFile: true, }, } { t.Run(testCase.desc, func(t *testing.T) { path := tempDir(t, "cpuset") tempCpusetTestFiles := maps.Clone(cpusetTestFiles) if testCase.removeFile { delete(tempCpusetTestFiles, testCase.filename) writeFileContents(t, path, tempCpusetTestFiles) cpuset := &CpusetGroup{} actualStats := *cgroups.NewStats() err := cpuset.GetStats(path, &actualStats) if err != nil { t.Errorf("failed unexpectedly: %q", err) } } else { tempCpusetTestFiles[testCase.filename] = testCase.contents writeFileContents(t, path, tempCpusetTestFiles) cpuset := &CpusetGroup{} actualStats := *cgroups.NewStats() err := cpuset.GetStats(path, &actualStats) if err == nil { t.Error("failed to return expected error") } } }) } } cgroups-0.0.4/fs/devices.go000066400000000000000000000014001503527177300155630ustar00rootroot00000000000000package fs import ( "github.com/opencontainers/cgroups" ) type DevicesGroup struct{} func (s *DevicesGroup) Name() string { return "devices" } func (s *DevicesGroup) Apply(path string, r *cgroups.Resources, pid int) error { if r.SkipDevices { return nil } if path == "" { // Return error here, since devices cgroup // is a hard requirement for container's security. 
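// (errSubsystemDoesNotExist is deliberately a plain error rather than a permission-style one, so the rootless isIgnorableError path in fs.go cannot swallow it: a container with no devices cgroup fails Apply outright.)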
return errSubsystemDoesNotExist } return apply(path, pid) } func (s *DevicesGroup) Set(path string, r *cgroups.Resources) error { if cgroups.DevicesSetV1 == nil { if len(r.Devices) == 0 { return nil } return cgroups.ErrDevicesUnsupported } return cgroups.DevicesSetV1(path, r) } func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } cgroups-0.0.4/fs/error.go000066400000000000000000000006051503527177300153000ustar00rootroot00000000000000package fs import ( "fmt" "github.com/opencontainers/cgroups/fscommon" ) type parseError = fscommon.ParseError // malformedLine is used by all cgroupfs file parsers that expect a line // in a particular format but get some garbage instead. func malformedLine(path, file, line string) error { return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} } cgroups-0.0.4/fs/freezer.go000066400000000000000000000111721503527177300156120ustar00rootroot00000000000000package fs import ( "errors" "fmt" "os" "strings" "time" "github.com/opencontainers/cgroups" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) type FreezerGroup struct{} func (s *FreezerGroup) Name() string { return "freezer" } func (s *FreezerGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *FreezerGroup) Set(path string, r *cgroups.Resources) (Err error) { switch r.Freezer { case cgroups.Frozen: defer func() { if Err != nil { // Freezing failed, and it is bad and dangerous // to leave the cgroup in FROZEN or FREEZING // state, so (try to) thaw it back. _ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) } }() // As per older kernel docs (freezer-subsystem.txt before // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, // userspace should either retry or thaw. While current // kernel cgroup v1 docs no longer mention a need to retry, // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably // freeze a cgroup v1 while new processes keep appearing in it // (either via fork/clone or by writing new PIDs to // cgroup.procs). // // The numbers below are empirically chosen to have a decent // chance to succeed in various scenarios ("runc pause/unpause // with parallel runc exec" and "bare freeze/unfreeze on a very // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. // // Adding any amount of sleep in between retries did not // increase the chances of successful freeze in "pause/unpause // with parallel exec" reproducer. OTOH, adding an occasional // sleep helped for the case where the system is extremely slow // (CentOS 7 VM on GHA CI). // // Alas, this is still a game of chances, since the real fix // belong to the kernel (cgroup v2 do not have this bug). for i := range 1000 { if i%50 == 49 { // Occasional thaw and sleep improves // the chances to succeed in freezing // in case new processes keep appearing // in the cgroup. _ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) time.Sleep(10 * time.Millisecond) } if err := cgroups.WriteFile(path, "freezer.state", string(cgroups.Frozen)); err != nil { return err } if i%25 == 24 { // Occasional short sleep before reading // the state back also improves the chances to // succeed in freezing in case of a very slow // system. 
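// To recap the cadence of this loop: up to 1000 freeze attempts, a thaw plus 10ms pause on every 50th iteration (above), and the short pause below before re-reading freezer.state on every 25th iteration.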
time.Sleep(10 * time.Microsecond) } state, err := cgroups.ReadFile(path, "freezer.state") if err != nil { return err } state = strings.TrimSpace(state) switch state { case "FREEZING": continue case string(cgroups.Frozen): if i > 1 { logrus.Debugf("frozen after %d retries", i) } return nil default: // should never happen return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) } } // Despite our best efforts, it got stuck in FREEZING. return errors.New("unable to freeze") case cgroups.Thawed: return cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) case cgroups.Undefined: return nil default: return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) } } func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } func (s *FreezerGroup) GetState(path string) (cgroups.FreezerState, error) { for { state, err := cgroups.ReadFile(path, "freezer.state") if err != nil { // If the kernel is too old, then we just treat the freezer as // being in an "undefined" state. if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { err = nil } return cgroups.Undefined, err } switch strings.TrimSpace(state) { case "THAWED": return cgroups.Thawed, nil case "FROZEN": // Find out whether the cgroup is frozen directly, // or indirectly via an ancestor. self, err := cgroups.ReadFile(path, "freezer.self_freezing") if err != nil { // If the kernel is too old, then we just treat // it as being frozen. if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { err = nil } return cgroups.Frozen, err } switch self { case "0\n": return cgroups.Thawed, nil case "1\n": return cgroups.Frozen, nil default: return cgroups.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) } case "FREEZING": // Make sure we get a stable freezer state, so retry if the cgroup // is still undergoing freezing. This should be a temporary delay. 
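// Note there is no retry cap here, unlike Set above: FREEZING is transient and should settle into FROZEN (or THAWED, if something else thaws the cgroup), so we simply poll.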
time.Sleep(1 * time.Millisecond) continue default: return cgroups.Undefined, fmt.Errorf("unknown freezer.state %q", state) } } } cgroups-0.0.4/fs/freezer_test.go000066400000000000000000000016771503527177300166620ustar00rootroot00000000000000package fs import ( "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) func TestFreezerSetState(t *testing.T) { path := tempDir(t, "freezer") writeFileContents(t, path, map[string]string{ "freezer.state": string(cgroups.Frozen), }) r := &cgroups.Resources{ Freezer: cgroups.Thawed, } freezer := &FreezerGroup{} if err := freezer.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "freezer.state") if err != nil { t.Fatal(err) } if value != string(cgroups.Thawed) { t.Fatal("Got the wrong value, set freezer.state failed.") } } func TestFreezerSetInvalidState(t *testing.T) { path := tempDir(t, "freezer") const invalidArg cgroups.FreezerState = "Invalid" r := &cgroups.Resources{ Freezer: invalidArg, } freezer := &FreezerGroup{} if err := freezer.Set(path, r); err == nil { t.Fatal("Failed to return invalid argument error") } } cgroups-0.0.4/fs/fs.go000066400000000000000000000151071503527177300145620ustar00rootroot00000000000000package fs import ( "errors" "fmt" "os" "sync" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) var subsystems = []subsystem{ &CpusetGroup{}, &DevicesGroup{}, &MemoryGroup{}, &CpuGroup{}, &CpuacctGroup{}, &PidsGroup{}, &BlkioGroup{}, &HugetlbGroup{}, &NetClsGroup{}, &NetPrioGroup{}, &PerfEventGroup{}, &FreezerGroup{}, &RdmaGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, &NameGroup{GroupName: "misc", Join: true}, } var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") func init() { // If using cgroups-hybrid mode then add a "" controller indicating // it should join the cgroups v2. if cgroups.IsCgroup2HybridMode() { subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) } } type subsystem interface { // Name returns the name of the subsystem. Name() string // GetStats fills in the stats for the subsystem. GetStats(path string, stats *cgroups.Stats) error // Apply creates and joins a cgroup, adding pid into it. Some // subsystems use resources to pre-configure the cgroup parents // before creating or joining it. Apply(path string, r *cgroups.Resources, pid int) error // Set sets the cgroup resources. Set(path string, r *cgroups.Resources) error } type Manager struct { mu sync.Mutex cgroups *cgroups.Cgroup paths map[string]string } func NewManager(cg *cgroups.Cgroup, paths map[string]string) (*Manager, error) { // Some v1 controllers (cpu, cpuset, and devices) expect // cgroups.Resources to not be nil in Apply. if cg.Resources == nil { return nil, errors.New("cgroup v1 manager needs cgroups.Resources to be set during manager creation") } if cg.Resources.Unified != nil { return nil, cgroups.ErrV1NoUnified } if paths == nil { var err error paths, err = initPaths(cg) if err != nil { return nil, err } } return &Manager{ cgroups: cg, paths: paths, }, nil } // isIgnorableError returns whether err is a permission error (in the loose // sense of the word). This includes EROFS (which for an unprivileged user is // basically a permission error) and EACCES (for similar reasons) as well as // the normal EPERM. func isIgnorableError(rootless bool, err error) bool { // We do not ignore errors if we are root. if !rootless { return false } // Is it an ordinary EPERM? 
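// (errors.Is with os.ErrPermission already matches a wrapped EPERM or EACCES errno, so the explicit errno check below mainly adds EROFS, which amounts to a permission error for an unprivileged user.)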
if errors.Is(err, os.ErrPermission) { return true } // Handle some specific syscall errors. var errno unix.Errno if errors.As(err, &errno) { return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES } return false } func (m *Manager) Apply(pid int) (retErr error) { m.mu.Lock() defer m.mu.Unlock() c := m.cgroups for _, sys := range subsystems { name := sys.Name() p, ok := m.paths[name] if !ok { continue } if err := sys.Apply(p, c.Resources, pid); err != nil { // In the case of rootless (including euid=0 in userns), where an // explicit cgroup path hasn't been set, we don't bail on error in // case of permission problems here, but do delete the path from // the m.paths map, since it is either non-existent and could not // be created, or the pid could not be added to it. // // Cases where limits for the subsystem have been set are handled // later by Set, which fails with a friendly error (see // if path == "" in Set). if isIgnorableError(c.Rootless, err) && c.Path == "" { retErr = cgroups.ErrRootless delete(m.paths, name) continue } return err } } return retErr } func (m *Manager) Destroy() error { m.mu.Lock() defer m.mu.Unlock() return cgroups.RemovePaths(m.paths) } func (m *Manager) Path(subsys string) string { m.mu.Lock() defer m.mu.Unlock() return m.paths[subsys] } func (m *Manager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() stats := cgroups.NewStats() for _, sys := range subsystems { path := m.paths[sys.Name()] if path == "" { continue } if err := sys.GetStats(path, stats); err != nil { return nil, err } } return stats, nil } func (m *Manager) Set(r *cgroups.Resources) error { if r == nil { return nil } if r.Unified != nil { return cgroups.ErrV1NoUnified } m.mu.Lock() defer m.mu.Unlock() for _, sys := range subsystems { path := m.paths[sys.Name()] if err := sys.Set(path, r); err != nil { // When rootless is true, errors from the device subsystem // are ignored, as it is really not expected to work. if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) { continue } // However, errors from other subsystems are not ignored. // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" if path == "" { // We never created a path for this cgroup, so we cannot set // limits for it (though we have already tried at this point). return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) } return err } } return nil } // Freeze toggles the container's freezer cgroup depending on the state // provided func (m *Manager) Freeze(state cgroups.FreezerState) error { path := m.Path("freezer") if path == "" { return errors.New("cannot toggle freezer: cgroups not configured for container") } prevState := m.cgroups.Resources.Freezer m.cgroups.Resources.Freezer = state freezer := &FreezerGroup{} if err := freezer.Set(path, m.cgroups.Resources); err != nil { m.cgroups.Resources.Freezer = prevState return err } return nil } func (m *Manager) GetPids() ([]int, error) { return cgroups.GetPids(m.Path("devices")) } func (m *Manager) GetAllPids() ([]int, error) { return cgroups.GetAllPids(m.Path("devices")) } func (m *Manager) GetPaths() map[string]string { m.mu.Lock() defer m.mu.Unlock() return m.paths } func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) { return m.cgroups, nil } func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) { dir := m.Path("freezer") // If the container doesn't have the freezer cgroup, say it's undefined. 
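// (As with Exists and GetPids, which key off the devices path, an empty path here simply means the controller was never configured for this container.)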
if dir == "" { return cgroups.Undefined, nil } freezer := &FreezerGroup{} return freezer.GetState(dir) } func (m *Manager) Exists() bool { return cgroups.PathExists(m.Path("devices")) } func OOMKillCount(path string) (uint64, error) { return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") } func (m *Manager) OOMKillCount() (uint64, error) { c, err := OOMKillCount(m.Path("memory")) // Ignore ENOENT when rootless as it couldn't create cgroup. if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { err = nil } return c, err } cgroups-0.0.4/fs/fs_test.go000066400000000000000000000015221503527177300156150ustar00rootroot00000000000000package fs import ( "testing" "github.com/opencontainers/cgroups" ) func BenchmarkGetStats(b *testing.B) { if cgroups.IsCgroup2UnifiedMode() { b.Skip("cgroup v2 is not supported") } // Unset TestMode as we work with real cgroupfs here, // and we want OpenFile to perform the fstype check. cgroups.TestMode = false defer func() { cgroups.TestMode = true }() cg := &cgroups.Cgroup{ Path: "/some/kind/of/a/path/here", Resources: &cgroups.Resources{}, } m, err := NewManager(cg, nil) if err != nil { b.Fatal(err) } err = m.Apply(-1) if err != nil { b.Fatal(err) } defer func() { _ = m.Destroy() }() var st *cgroups.Stats b.ResetTimer() for i := 0; i < b.N; i++ { st, err = m.GetStats() if err != nil { b.Fatal(err) } } if st.CpuStats.CpuUsage.TotalUsage != 0 { b.Fatalf("stats: %+v", st) } } cgroups-0.0.4/fs/hugetlb.go000066400000000000000000000033361503527177300156050ustar00rootroot00000000000000package fs import ( "errors" "os" "strconv" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) type HugetlbGroup struct{} func (s *HugetlbGroup) Name() string { return "hugetlb" } func (s *HugetlbGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *HugetlbGroup) Set(path string, r *cgroups.Resources) error { const suffix = ".limit_in_bytes" skipRsvd := false for _, hugetlb := range r.HugetlbLimit { prefix := "hugetlb." + hugetlb.Pagesize val := strconv.FormatUint(hugetlb.Limit, 10) if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil { return err } if skipRsvd { continue } if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil { if errors.Is(err, os.ErrNotExist) { skipRsvd = true continue } return err } } return nil } func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { if !cgroups.PathExists(path) { return nil } rsvd := ".rsvd" hugetlbStats := cgroups.HugetlbStats{} for _, pageSize := range cgroups.HugePageSizes() { again: prefix := "hugetlb." 
+ pageSize + rsvd value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes") if err != nil { if rsvd != "" && errors.Is(err, os.ErrNotExist) { rsvd = "" goto again } return err } hugetlbStats.Usage = value value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes") if err != nil { return err } hugetlbStats.MaxUsage = value value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt") if err != nil { return err } hugetlbStats.Failcnt = value stats.HugetlbStats[pageSize] = hugetlbStats } return nil } cgroups-0.0.4/fs/hugetlb_test.go000066400000000000000000000107071503527177300166440ustar00rootroot00000000000000package fs import ( "fmt" "strconv" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( hugetlbUsageContents = "128\n" hugetlbMaxUsageContents = "256\n" hugetlbFailcnt = "100\n" ) const ( usage = "hugetlb.%s.usage_in_bytes" limit = "hugetlb.%s.limit_in_bytes" maxUsage = "hugetlb.%s.max_usage_in_bytes" failcnt = "hugetlb.%s.failcnt" rsvdUsage = "hugetlb.%s.rsvd.usage_in_bytes" rsvdLimit = "hugetlb.%s.rsvd.limit_in_bytes" rsvdMaxUsage = "hugetlb.%s.rsvd.max_usage_in_bytes" rsvdFailcnt = "hugetlb.%s.rsvd.failcnt" ) func TestHugetlbSetHugetlb(t *testing.T) { path := tempDir(t, "hugetlb") const ( hugetlbBefore = 256 hugetlbAfter = 512 ) for _, pageSize := range cgroups.HugePageSizes() { writeFileContents(t, path, map[string]string{ fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore), }) } r := &cgroups.Resources{} for _, pageSize := range cgroups.HugePageSizes() { r.HugetlbLimit = []*cgroups.HugepageLimit{ { Pagesize: pageSize, Limit: hugetlbAfter, }, } hugetlb := &HugetlbGroup{} if err := hugetlb.Set(path, r); err != nil { t.Fatal(err) } } for _, pageSize := range cgroups.HugePageSizes() { for _, f := range []string{limit, rsvdLimit} { limit := fmt.Sprintf(f, pageSize) value, err := fscommon.GetCgroupParamUint(path, limit) if err != nil { t.Fatal(err) } if value != hugetlbAfter { t.Fatalf("Set %s failed. 
Expected: %v, Got: %v", limit, hugetlbAfter, value) } } } } func TestHugetlbStats(t *testing.T) { path := tempDir(t, "hugetlb") for _, pageSize := range cgroups.HugePageSizes() { writeFileContents(t, path, map[string]string{ fmt.Sprintf(usage, pageSize): hugetlbUsageContents, fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents, fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt, }) } hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() err := hugetlb.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} for _, pageSize := range cgroups.HugePageSizes() { expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) } } func TestHugetlbRStatsRsvd(t *testing.T) { path := tempDir(t, "hugetlb") for _, pageSize := range cgroups.HugePageSizes() { writeFileContents(t, path, map[string]string{ fmt.Sprintf(rsvdUsage, pageSize): hugetlbUsageContents, fmt.Sprintf(rsvdMaxUsage, pageSize): hugetlbMaxUsageContents, fmt.Sprintf(rsvdFailcnt, pageSize): hugetlbFailcnt, }) } hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() err := hugetlb.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} for _, pageSize := range cgroups.HugePageSizes() { expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) } } func TestHugetlbStatsNoUsageFile(t *testing.T) { path := tempDir(t, "hugetlb") writeFileContents(t, path, map[string]string{ maxUsage: hugetlbMaxUsageContents, }) hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { path := tempDir(t, "hugetlb") for _, pageSize := range cgroups.HugePageSizes() { writeFileContents(t, path, map[string]string{ fmt.Sprintf(usage, pageSize): hugetlbUsageContents, }) } hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestHugetlbStatsBadUsageFile(t *testing.T) { path := tempDir(t, "hugetlb") for _, pageSize := range cgroups.HugePageSizes() { writeFileContents(t, path, map[string]string{ fmt.Sprintf(usage, pageSize): "bad", maxUsage: hugetlbMaxUsageContents, }) } hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { path := tempDir(t, "hugetlb") writeFileContents(t, path, map[string]string{ usage: hugetlbUsageContents, maxUsage: "bad", }) hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } cgroups-0.0.4/fs/memory.go000066400000000000000000000220601503527177300154560ustar00rootroot00000000000000package fs import ( "bufio" "errors" "fmt" "math" "os" "path/filepath" "strconv" "strings" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" cgroupMemoryLimit = "memory.limit_in_bytes" cgroupMemoryUsage = "memory.usage_in_bytes" cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" ) type MemoryGroup struct{} func (s *MemoryGroup) Name() string { return "memory" } func (s *MemoryGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return 
apply(path, pid) } func setMemory(path string, val int64) error { if val == 0 { return nil } err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) if !errors.Is(err, unix.EBUSY) { return err } // EBUSY means the kernel can't set new limit as it's too low // (lower than the current usage). Return more specific error. usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) if err != nil { return err } max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) if err != nil { return err } return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) } func setSwap(path string, val int64) error { if val == 0 { return nil } return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) } func setMemoryAndSwap(path string, r *cgroups.Resources) error { // If the memory update is set to -1 and the swap is not explicitly // set, we should also set swap to -1, it means unlimited memory. if r.Memory == -1 && r.MemorySwap == 0 { // Only set swap if it's enabled in kernel if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { r.MemorySwap = -1 } } // When memory and swap memory are both set, we need to handle the cases // for updating container. if r.Memory != 0 && r.MemorySwap != 0 { curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) if err != nil { return err } // When update memory limit, we should adapt the write sequence // for memory and swap memory, so it won't fail because the new // value and the old value don't fit kernel's validation. if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { if err := setSwap(path, r.MemorySwap); err != nil { return err } if err := setMemory(path, r.Memory); err != nil { return err } return nil } } if err := setMemory(path, r.Memory); err != nil { return err } if err := setSwap(path, r.MemorySwap); err != nil { return err } return nil } func (s *MemoryGroup) Set(path string, r *cgroups.Resources) error { if err := setMemoryAndSwap(path, r); err != nil { return err } // ignore KernelMemory and KernelMemoryTCP if r.MemoryReservation != 0 { if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { return err } } if r.OomKillDisable { if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { return err } } if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { return nil } else if *r.MemorySwappiness <= 100 { if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { return err } } else { return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) } return nil } func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { const file = "memory.stat" statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) if err != nil { if os.IsNotExist(err) { return nil } return err } defer statsFile.Close() sc := bufio.NewScanner(statsFile) for sc.Scan() { t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return &parseError{Path: path, File: file, Err: err} } stats.MemoryStats.Stats[t] = v } stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] memoryUsage, err := getMemoryData(path, "") if err != nil { return err } stats.MemoryStats.Usage = memoryUsage swapUsage, err := getMemoryData(path, "memsw") if err != nil { return err } stats.MemoryStats.SwapUsage = swapUsage stats.MemoryStats.SwapOnlyUsage = 
cgroups.MemoryData{ Usage: swapUsage.Usage - memoryUsage.Usage, Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, } kernelUsage, err := getMemoryData(path, "kmem") if err != nil { return err } stats.MemoryStats.KernelUsage = kernelUsage kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") if err != nil { return err } stats.MemoryStats.KernelTCPUsage = kernelTCPUsage value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") if err != nil { return err } if value == 1 { stats.MemoryStats.UseHierarchy = true } pagesByNUMA, err := getPageUsageByNUMA(path) if err != nil { return err } stats.MemoryStats.PageUsageByNUMA = pagesByNUMA return nil } func getMemoryData(path, name string) (cgroups.MemoryData, error) { memoryData := cgroups.MemoryData{} moduleName := "memory" if name != "" { moduleName = "memory." + name } var ( usage = moduleName + ".usage_in_bytes" maxUsage = moduleName + ".max_usage_in_bytes" failcnt = moduleName + ".failcnt" limit = moduleName + ".limit_in_bytes" ) value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { if name != "" && os.IsNotExist(err) { // Ignore ENOENT as swap and kmem controllers // are optional in the kernel. return cgroups.MemoryData{}, nil } return cgroups.MemoryData{}, err } memoryData.Usage = value value, err = fscommon.GetCgroupParamUint(path, maxUsage) if err != nil { return cgroups.MemoryData{}, err } memoryData.MaxUsage = value value, err = fscommon.GetCgroupParamUint(path, failcnt) if err != nil { return cgroups.MemoryData{}, err } memoryData.Failcnt = value value, err = fscommon.GetCgroupParamUint(path, limit) if err != nil { if name == "kmem" && os.IsNotExist(err) { // Ignore ENOENT as kmem.limit_in_bytes has // been removed in newer kernels. return memoryData, nil } return cgroups.MemoryData{}, err } memoryData.Limit = value return memoryData, nil } func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { const ( maxColumns = math.MaxUint8 + 1 file = "memory.numa_stat" ) stats := cgroups.PageUsageByNUMA{} fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) if os.IsNotExist(err) { return stats, nil } else if err != nil { return stats, err } defer fd.Close() // File format is documented in linux/Documentation/cgroup-v1/memory.txt // and it looks like this: // // total= N0= N1= ... // file= N0= N1= ... // anon= N0= N1= ... // unevictable= N0= N1= ... // hierarchical_= N0= N1= ... scanner := bufio.NewScanner(fd) for scanner.Scan() { var field *cgroups.PageStats line := scanner.Text() columns := strings.SplitN(line, " ", maxColumns) for i, column := range columns { key, val, ok := strings.Cut(column, "=") // Some custom kernels have non-standard fields, like // numa_locality 0 0 0 0 0 0 0 0 0 0 // numa_exectime 0 if !ok { if i == 0 { // Ignore/skip those. break } else { // The first column was already validated, // so be strict to the rest. return stats, malformedLine(path, file, line) } } if i == 0 { // First column: key is name, val is total. field = getNUMAField(&stats, key) if field == nil { // unknown field (new kernel?) break } field.Total, err = strconv.ParseUint(val, 0, 64) if err != nil { return stats, &parseError{Path: path, File: file, Err: err} } field.Nodes = map[uint8]uint64{} } else { // Subsequent columns: key is N, val is usage. if len(key) < 2 || key[0] != 'N' { // This is definitely an error. 
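// For example, "total=44611 N0=32631 N1=7501" yields field.Total = 44611 and field.Nodes = map[uint8]uint64{0: 32631, 1: 7501}; a later column whose key is not of the form N<node> ends up in this malformedLine error instead.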
return stats, malformedLine(path, file, line) } n, err := strconv.ParseUint(key[1:], 10, 8) if err != nil { return stats, &parseError{Path: path, File: file, Err: err} } usage, err := strconv.ParseUint(val, 10, 64) if err != nil { return stats, &parseError{Path: path, File: file, Err: err} } field.Nodes[uint8(n)] = usage } } } if err := scanner.Err(); err != nil { return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} } return stats, nil } func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { switch name { case "total": return &stats.Total case "file": return &stats.File case "anon": return &stats.Anon case "unevictable": return &stats.Unevictable case "hierarchical_total": return &stats.Hierarchical.Total case "hierarchical_file": return &stats.Hierarchical.File case "hierarchical_anon": return &stats.Hierarchical.Anon case "hierarchical_unevictable": return &stats.Hierarchical.Unevictable } return nil } cgroups-0.0.4/fs/memory_test.go000066400000000000000000000353711503527177300165260ustar00rootroot00000000000000package fs import ( "strconv" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( memoryStatContents = `cache 512 rss 1024` memoryUsageContents = "2048\n" memoryMaxUsageContents = "4096\n" memoryFailcnt = "100\n" memoryLimitContents = "8192\n" memoryUseHierarchyContents = "1\n" memoryNUMAStatContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 file=44428 N0=32614 N1=7335 N2=1982 N3=2497 anon=183 N0=17 N1=166 N2=0 N3=0 unevictable=0 N0=0 N1=0 N2=0 N3=0 hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326 hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20 ` memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 file=44428 N0=32614 N1=7335 N2=1982 N3=2497 anon=183 N0=17 N1=166 N2=0 N3=0 unevictable=0 N0=0 N1=0 N2=0 N3=0 ` // Some custom kernels has extra fields that should be ignored memoryNUMAStatExtraContents = `numa_locality 0 0 0 0 0 0 0 0 0 0 numa_exectime 0 whatever=100 N0=0 ` ) func TestMemorySetMemory(t *testing.T) { path := tempDir(t, "memory") const ( memoryBefore = 314572800 // 300M memoryAfter = 524288000 // 500M reservationBefore = 209715200 // 200M reservationAfter = 314572800 // 300M ) writeFileContents(t, path, map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore), }) r := &cgroups.Resources{ Memory: memoryAfter, MemoryReservation: reservationAfter, } memory := &MemoryGroup{} if err := memory.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") if err != nil { t.Fatal(err) } if value != memoryAfter { t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") } value, err = fscommon.GetCgroupParamUint(path, "memory.soft_limit_in_bytes") if err != nil { t.Fatal(err) } if value != reservationAfter { t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.") } } func TestMemorySetMemoryswap(t *testing.T) { path := tempDir(t, "memory") const ( memoryswapBefore = 314572800 // 300M memoryswapAfter = 524288000 // 500M ) writeFileContents(t, path, map[string]string{ "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), }) r := &cgroups.Resources{ MemorySwap: memoryswapAfter, } memory := &MemoryGroup{} if err := memory.Set(path, r); err != nil { 
t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") if err != nil { t.Fatal(err) } if value != memoryswapAfter { t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") } } func TestMemorySetMemoryLargerThanSwap(t *testing.T) { path := tempDir(t, "memory") const ( memoryBefore = 314572800 // 300M memoryswapBefore = 524288000 // 500M memoryAfter = 629145600 // 600M memoryswapAfter = 838860800 // 800M ) writeFileContents(t, path, map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), // Set will call getMemoryData when memory and swap memory are // both set, fake these fields so we don't get error. "memory.usage_in_bytes": "0", "memory.max_usage_in_bytes": "0", "memory.failcnt": "0", }) r := &cgroups.Resources{ Memory: memoryAfter, MemorySwap: memoryswapAfter, } memory := &MemoryGroup{} if err := memory.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") if err != nil { t.Fatal(err) } if value != memoryAfter { t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") } value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") if err != nil { t.Fatal(err) } if value != memoryswapAfter { t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") } } func TestMemorySetSwapSmallerThanMemory(t *testing.T) { path := tempDir(t, "memory") const ( memoryBefore = 629145600 // 600M memoryswapBefore = 838860800 // 800M memoryAfter = 314572800 // 300M memoryswapAfter = 524288000 // 500M ) writeFileContents(t, path, map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), }) r := &cgroups.Resources{ Memory: memoryAfter, MemorySwap: memoryswapAfter, } memory := &MemoryGroup{} if err := memory.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") if err != nil { t.Fatal(err) } if value != memoryAfter { t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter) } value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") if err != nil { t.Fatal(err) } if value != memoryswapAfter { t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter) } } func TestMemorySetMemorySwappinessDefault(t *testing.T) { path := tempDir(t, "memory") swappinessBefore := 60 // default is 60 swappinessAfter := uint64(0) writeFileContents(t, path, map[string]string{ "memory.swappiness": strconv.Itoa(swappinessBefore), }) r := &cgroups.Resources{ MemorySwappiness: &swappinessAfter, } memory := &MemoryGroup{} if err := memory.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "memory.swappiness") if err != nil { t.Fatal(err) } if value != swappinessAfter { t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter) } } func TestMemoryStats(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.limit_in_bytes": memoryLimitContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.failcnt": memoryFailcnt, "memory.memsw.usage_in_bytes": memoryUsageContents, "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents, "memory.memsw.failcnt": memoryFailcnt, 
"memory.memsw.limit_in_bytes": memoryLimitContents, "memory.kmem.usage_in_bytes": memoryUsageContents, "memory.kmem.max_usage_in_bytes": memoryMaxUsageContents, "memory.kmem.failcnt": memoryFailcnt, "memory.kmem.limit_in_bytes": memoryLimitContents, "memory.use_hierarchy": memoryUseHierarchyContents, "memory.numa_stat": memoryNUMAStatContents + memoryNUMAStatExtraContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.MemoryStats{ Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true, PageUsageByNUMA: cgroups.PageUsageByNUMA{ PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, }, Hierarchical: cgroups.PageUsageByNUMAInner{ Total: cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}}, File: cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}}, Anon: cgroups.PageStats{Total: 46096, Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}}, Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}}, }, }, } expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) } func TestMemoryStatsNoStatFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": memoryLimitContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } } func TestMemoryStatsNoUsageFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": memoryLimitContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsNoMaxUsageFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.limit_in_bytes": memoryLimitContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := 
memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadStatFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": "rss rss", "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": memoryLimitContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadUsageFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": "bad", "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": memoryLimitContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadMaxUsageFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": "bad", "memory.limit_in_bytes": memoryLimitContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadLimitInBytesFile(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": "bad", }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemorySetOomControl(t *testing.T) { path := tempDir(t, "memory") const ( oomKillDisable = 1 // disable oom killer, default is 0 ) writeFileContents(t, path, map[string]string{ "memory.oom_control": strconv.Itoa(oomKillDisable), }) memory := &MemoryGroup{} r := &cgroups.Resources{} if err := memory.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "memory.oom_control") if err != nil { t.Fatal(err) } if value != oomKillDisable { t.Fatalf("Got the wrong value, set memory.oom_control failed.") } } func TestNoHierarchicalNumaStat(t *testing.T) { path := tempDir(t, "memory") writeFileContents(t, path, map[string]string{ "memory.numa_stat": memoryNUMAStatNoHierarchyContents + memoryNUMAStatExtraContents, }) actualStats, err := getPageUsageByNUMA(path) if err != nil { t.Fatal(err) } pageUsageByNUMA := cgroups.PageUsageByNUMA{ PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, }, Hierarchical: cgroups.PageUsageByNUMAInner{}, } expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats) } func TestBadNumaStat(t *testing.T) { memoryNUMAStatBadContents := []struct { desc, contents string }{ { desc: "Nx where x is not a number", contents: `total=44611 N0=44611, file=44428 Nx=0 `, }, { desc: "Nx where x > 255", contents: `total=44611 
N333=444`, }, { desc: "Nx argument missing", contents: `total=44611 N0=123 N1=`, }, { desc: "Nx argument is not a number", contents: `total=44611 N0=123 N1=a`, }, { desc: "Missing = after Nx", contents: `total=44611 N0=123 N1`, }, { desc: "No Nx at non-first position", contents: `total=44611 N0=32631 file=44428 N0=32614 anon=183 N0=12 badone `, }, } path := tempDir(t, "memory") for _, c := range memoryNUMAStatBadContents { writeFileContents(t, path, map[string]string{ "memory.numa_stat": c.contents, }) _, err := getPageUsageByNUMA(path) if err == nil { t.Errorf("case %q: expected error, got nil", c.desc) } } } func TestWithoutNumaStat(t *testing.T) { path := tempDir(t, "memory") actualStats, err := getPageUsageByNUMA(path) if err != nil { t.Fatal(err) } expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats) } cgroups-0.0.4/fs/name.go000066400000000000000000000010251503527177300150640ustar00rootroot00000000000000package fs import ( "github.com/opencontainers/cgroups" ) type NameGroup struct { GroupName string Join bool } func (s *NameGroup) Name() string { return s.GroupName } func (s *NameGroup) Apply(path string, _ *cgroups.Resources, pid int) error { if s.Join { // Ignore errors if the named cgroup does not exist. _ = apply(path, pid) } return nil } func (s *NameGroup) Set(_ string, _ *cgroups.Resources) error { return nil } func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } cgroups-0.0.4/fs/net_cls.go000066400000000000000000000011361503527177300155760ustar00rootroot00000000000000package fs import ( "strconv" "github.com/opencontainers/cgroups" ) type NetClsGroup struct{} func (s *NetClsGroup) Name() string { return "net_cls" } func (s *NetClsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *NetClsGroup) Set(path string, r *cgroups.Resources) error { if r.NetClsClassid != 0 { if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { return err } } return nil } func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } cgroups-0.0.4/fs/net_cls_test.go000066400000000000000000000015731503527177300166420ustar00rootroot00000000000000package fs import ( "strconv" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( classidBefore = 0x100002 classidAfter = 0x100001 ) func TestNetClsSetClassid(t *testing.T) { path := tempDir(t, "net_cls") writeFileContents(t, path, map[string]string{ "net_cls.classid": strconv.FormatUint(classidBefore, 10), }) r := &cgroups.Resources{ NetClsClassid: classidAfter, } netcls := &NetClsGroup{} if err := netcls.Set(path, r); err != nil { t.Fatal(err) } // As we are in mock environment, we can't get correct value of classid from // net_cls.classid. // So. 
we just judge if we successfully write classid into file value, err := fscommon.GetCgroupParamUint(path, "net_cls.classid") if err != nil { t.Fatal(err) } if value != classidAfter { t.Fatal("Got the wrong value, set net_cls.classid failed.") } } cgroups-0.0.4/fs/net_prio.go000066400000000000000000000011251503527177300157640ustar00rootroot00000000000000package fs import ( "github.com/opencontainers/cgroups" ) type NetPrioGroup struct{} func (s *NetPrioGroup) Name() string { return "net_prio" } func (s *NetPrioGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *NetPrioGroup) Set(path string, r *cgroups.Resources) error { for _, prioMap := range r.NetPrioIfpriomap { if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { return err } } return nil } func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } cgroups-0.0.4/fs/net_prio_test.go000066400000000000000000000012221503527177300170210ustar00rootroot00000000000000package fs import ( "strings" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) var prioMap = []*cgroups.IfPrioMap{ { Interface: "test", Priority: 5, }, } func TestNetPrioSetIfPrio(t *testing.T) { path := tempDir(t, "net_prio") r := &cgroups.Resources{ NetPrioIfpriomap: prioMap, } netPrio := &NetPrioGroup{} if err := netPrio.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "net_prio.ifpriomap") if err != nil { t.Fatal(err) } if !strings.Contains(value, "test 5") { t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.") } } cgroups-0.0.4/fs/paths.go000066400000000000000000000071151503527177300152710ustar00rootroot00000000000000package fs import ( "errors" "os" "path/filepath" "sync" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/internal/path" ) // The absolute path to the root of the cgroup hierarchies. var ( cgroupRootLock sync.Mutex cgroupRoot string ) const defaultCgroupRoot = "/sys/fs/cgroup" func initPaths(cg *cgroups.Cgroup) (map[string]string, error) { root, err := rootPath() if err != nil { return nil, err } inner, err := path.Inner(cg) if err != nil { return nil, err } paths := make(map[string]string) for _, sys := range subsystems { name := sys.Name() path, err := subsysPath(root, inner, name) if err != nil { // The non-presence of the devices subsystem // is considered fatal for security reasons. if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { continue } return nil, err } paths[name] = path } return paths, nil } func tryDefaultCgroupRoot() string { var st, pst unix.Stat_t // (1) it should be a directory... err := unix.Lstat(defaultCgroupRoot, &st) if err != nil || st.Mode&unix.S_IFDIR == 0 { return "" } // (2) ... and a mount point ... err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) if err != nil { return "" } if st.Dev == pst.Dev { // parent dir has the same dev -- not a mount point return "" } // (3) ... of 'tmpfs' fs type. var fst unix.Statfs_t err = unix.Statfs(defaultCgroupRoot, &fst) if err != nil || fst.Type != unix.TMPFS_MAGIC { return "" } // (4) it should have at least 1 entry ... dir, err := os.Open(defaultCgroupRoot) if err != nil { return "" } defer dir.Close() names, err := dir.Readdirnames(1) if err != nil { return "" } if len(names) < 1 { return "" } // ... which is a cgroup mount point. 
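// (Heuristic: on a typical cgroup v1 host, /sys/fs/cgroup is a tmpfs
// holding one cgroupfs mount per controller, so probing the fs type of
// the first directory entry is enough to recognize that layout.)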
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { return "" } return defaultCgroupRoot } // rootPath finds and returns path to the root of the cgroup hierarchies. func rootPath() (string, error) { cgroupRootLock.Lock() defer cgroupRootLock.Unlock() if cgroupRoot != "" { return cgroupRoot, nil } // fast path cgroupRoot = tryDefaultCgroupRoot() if cgroupRoot != "" { return cgroupRoot, nil } // slow path: parse mountinfo mi, err := cgroups.GetCgroupMounts(false) if err != nil { return "", err } if len(mi) < 1 { return "", errors.New("no cgroup mount found in mountinfo") } // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), // use its parent directory. root := filepath.Dir(mi[0].Mountpoint) if _, err := os.Stat(root); err != nil { return "", err } cgroupRoot = root return cgroupRoot, nil } func subsysPath(root, inner, subsystem string) (string, error) { // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. if filepath.IsAbs(inner) { mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err } // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. return filepath.Join(root, filepath.Base(mnt), inner), nil } // Use GetOwnCgroupPath for dind-like cases, when cgroupns is not // available. This is ugly. parentPath, err := cgroups.GetOwnCgroupPath(subsystem) if err != nil { return "", err } return filepath.Join(parentPath, inner), nil } func apply(path string, pid int) error { if path == "" { return nil } if err := os.MkdirAll(path, 0o755); err != nil { return err } return cgroups.WriteCgroupProc(path, pid) } cgroups-0.0.4/fs/paths_test.go000066400000000000000000000051261503527177300163300ustar00rootroot00000000000000package fs import ( "path/filepath" "strings" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/internal/path" ) func TestInvalidCgroupPath(t *testing.T) { if cgroups.IsCgroup2UnifiedMode() { t.Skip("cgroup v2 is not supported") } root, err := rootPath() if err != nil { t.Fatalf("couldn't get cgroup root: %v", err) } testCases := []struct { test string path, name, parent string }{ { test: "invalid cgroup path", path: "../../../../../../../../../../some/path", }, { test: "invalid absolute cgroup path", path: "/../../../../../../../../../../some/path", }, { test: "invalid cgroup parent", parent: "../../../../../../../../../../some/path", name: "name", }, { test: "invalid absolute cgroup parent", parent: "/../../../../../../../../../../some/path", name: "name", }, { test: "invalid cgroup name", parent: "parent", name: "../../../../../../../../../../some/path", }, { test: "invalid absolute cgroup name", parent: "parent", name: "/../../../../../../../../../../some/path", }, { test: "invalid cgroup name and parent", parent: "../../../../../../../../../../some/path", name: "../../../../../../../../../../some/path", }, { test: "invalid absolute cgroup name and parent", parent: "/../../../../../../../../../../some/path", name: "/../../../../../../../../../../some/path", }, } for _, tc := range testCases { t.Run(tc.test, func(t *testing.T) { config := &cgroups.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent} inner, err := path.Inner(config) if err != nil { t.Fatalf("couldn't get cgroup data: %v", err) } // Make sure the final inner path doesn't go outside the cgroup mountpoint. 
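// (This check assumes path.Inner returns a cleaned path, so any
// attempted traversal collapses into a leading "..".)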
if strings.HasPrefix(inner, "..") { t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") } // Double-check, using an actual cgroup. deviceRoot := filepath.Join(root, "devices") devicePath, err := subsysPath(root, inner, "devices") if err != nil { t.Fatalf("couldn't get cgroup path: %v", err) } if !strings.HasPrefix(devicePath, deviceRoot) { t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") } }) } } func TestTryDefaultCgroupRoot(t *testing.T) { res := tryDefaultCgroupRoot() exp := defaultCgroupRoot if cgroups.IsCgroup2UnifiedMode() { // checking that tryDefaultCgroupRoot does return "" // in case /sys/fs/cgroup is not cgroup v1 root dir. exp = "" } if res != exp { t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res) } } cgroups-0.0.4/fs/perf_event.go000066400000000000000000000006721503527177300163100ustar00rootroot00000000000000package fs import ( "github.com/opencontainers/cgroups" ) type PerfEventGroup struct{} func (s *PerfEventGroup) Name() string { return "perf_event" } func (s *PerfEventGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *PerfEventGroup) Set(_ string, _ *cgroups.Resources) error { return nil } func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } cgroups-0.0.4/fs/pids.go000066400000000000000000000023571503527177300151140ustar00rootroot00000000000000package fs import ( "math" "strconv" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) type PidsGroup struct{} func (s *PidsGroup) Name() string { return "pids" } func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *PidsGroup) Set(path string, r *cgroups.Resources) error { if r.PidsLimit != 0 { // "max" is the fallback value. limit := "max" if r.PidsLimit > 0 { limit = strconv.FormatInt(r.PidsLimit, 10) } if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { return err } } return nil } func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { if !cgroups.PathExists(path) { return nil } current, err := fscommon.GetCgroupParamUint(path, "pids.current") if err != nil { return err } max, err := fscommon.GetCgroupParamUint(path, "pids.max") if err != nil { return err } // If no limit is set, read from pids.max returns "max", which is // converted to MaxUint64 by GetCgroupParamUint. Historically, we // represent "no limit" for pids as 0, thus this conversion. 
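// For example, pids.max containing "max" is parsed as MaxUint64 and
// then reported as Limit == 0, i.e. unlimited.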
if max == math.MaxUint64 { max = 0 } stats.PidsStats.Current = current stats.PidsStats.Limit = max return nil } cgroups-0.0.4/fs/pids_test.go000066400000000000000000000043751503527177300161550ustar00rootroot00000000000000package fs import ( "strconv" "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) const ( maxUnlimited = -1 maxLimited = 1024 ) func TestPidsSetMax(t *testing.T) { path := tempDir(t, "pids") writeFileContents(t, path, map[string]string{ "pids.max": "max", }) r := &cgroups.Resources{ PidsLimit: maxLimited, } pids := &PidsGroup{} if err := pids.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamUint(path, "pids.max") if err != nil { t.Fatal(err) } if value != maxLimited { t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) } } func TestPidsSetUnlimited(t *testing.T) { path := tempDir(t, "pids") writeFileContents(t, path, map[string]string{ "pids.max": strconv.Itoa(maxLimited), }) r := &cgroups.Resources{ PidsLimit: maxUnlimited, } pids := &PidsGroup{} if err := pids.Set(path, r); err != nil { t.Fatal(err) } value, err := fscommon.GetCgroupParamString(path, "pids.max") if err != nil { t.Fatal(err) } if value != "max" { t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) } } func TestPidsStats(t *testing.T) { path := tempDir(t, "pids") writeFileContents(t, path, map[string]string{ "pids.current": strconv.Itoa(1337), "pids.max": strconv.Itoa(maxLimited), }) pids := &PidsGroup{} stats := *cgroups.NewStats() if err := pids.GetStats(path, &stats); err != nil { t.Fatal(err) } if stats.PidsStats.Current != 1337 { t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) } if stats.PidsStats.Limit != maxLimited { t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit) } } func TestPidsStatsUnlimited(t *testing.T) { path := tempDir(t, "pids") writeFileContents(t, path, map[string]string{ "pids.current": strconv.Itoa(4096), "pids.max": "max", }) pids := &PidsGroup{} stats := *cgroups.NewStats() if err := pids.GetStats(path, &stats); err != nil { t.Fatal(err) } if stats.PidsStats.Current != 4096 { t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current) } if stats.PidsStats.Limit != 0 { t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit) } } cgroups-0.0.4/fs/rdma.go000066400000000000000000000010011503527177300150610ustar00rootroot00000000000000package fs import ( "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) type RdmaGroup struct{} func (s *RdmaGroup) Name() string { return "rdma" } func (s *RdmaGroup) Apply(path string, _ *cgroups.Resources, pid int) error { return apply(path, pid) } func (s *RdmaGroup) Set(path string, r *cgroups.Resources) error { return fscommon.RdmaSet(path, r) } func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { return fscommon.RdmaGetStats(path, stats) } cgroups-0.0.4/fs/stats_util_test.go000066400000000000000000000124421503527177300174030ustar00rootroot00000000000000package fs import ( "errors" "fmt" "reflect" "testing" "github.com/opencontainers/cgroups" ) func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { if len(expected) != len(actual) { return errors.New("blkioStatEntries length do not match") } for i, expValue := range expected { actValue := actual[i] if expValue != actValue { return fmt.Errorf("expected: %v, actual: %v", expValue, actValue) } } return nil } func 
expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) { t.Helper() if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { t.Errorf("blkio IoServiceBytesRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { t.Errorf("blkio IoServicedRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { t.Errorf("blkio IoQueuedRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil { t.Errorf("blkio SectorsRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil { t.Errorf("blkio IoServiceTimeRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil { t.Errorf("blkio IoWaitTimeRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { t.Errorf("blkio IoMergedRecursive do not match: expected: %v, actual: %v", expected.IoMergedRecursive, actual.IoMergedRecursive) } if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil { t.Errorf("blkio IoTimeRecursive do not match: %s", err) } } func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { t.Helper() if expected != actual { t.Errorf("Expected throttling data: %v, actual: %v", expected, actual) } } func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) { t.Helper() if expected != actual { t.Errorf("Expected hugetlb stats: %v, actual: %v", expected, actual) } } func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { t.Helper() expectMemoryDataEquals(t, expected.Usage, actual.Usage) expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage) expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage) expectPageUsageByNUMAEquals(t, expected.PageUsageByNUMA, actual.PageUsageByNUMA) if expected.UseHierarchy != actual.UseHierarchy { t.Errorf("Expected memory use hierarchy: %v, actual: %v", expected.UseHierarchy, actual.UseHierarchy) } for key, expValue := range expected.Stats { actValue, ok := actual.Stats[key] if !ok { t.Errorf("Expected memory stat key %s not found", key) } if expValue != actValue { t.Errorf("Expected memory stat value: %d, actual: %d", expValue, actValue) } } } func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) { t.Helper() if expected.Usage != actual.Usage { t.Errorf("Expected memory usage: %d, actual: %d", expected.Usage, actual.Usage) } if expected.MaxUsage != actual.MaxUsage { t.Errorf("Expected memory max usage: %d, actual: %d", expected.MaxUsage, actual.MaxUsage) } if expected.Failcnt != actual.Failcnt { t.Errorf("Expected memory failcnt %d, actual: %d", expected.Failcnt, actual.Failcnt) } if expected.Limit != actual.Limit { t.Errorf("Expected memory limit: %d, actual: %d", expected.Limit, actual.Limit) } } func expectPageUsageByNUMAEquals(t *testing.T, expected, actual cgroups.PageUsageByNUMA) { t.Helper() if !reflect.DeepEqual(expected.Total, actual.Total) { t.Errorf("Expected total page usage by NUMA: %#v, actual: %#v", expected.Total, actual.Total) } if !reflect.DeepEqual(expected.File, actual.File) { t.Errorf("Expected file page 
usage by NUMA: %#v, actual: %#v", expected.File, actual.File) } if !reflect.DeepEqual(expected.Anon, actual.Anon) { t.Errorf("Expected anon page usage by NUMA: %#v, actual: %#v", expected.Anon, actual.Anon) } if !reflect.DeepEqual(expected.Unevictable, actual.Unevictable) { t.Errorf("Expected unevictable page usage by NUMA: %#v, actual: %#v", expected.Unevictable, actual.Unevictable) } if !reflect.DeepEqual(expected.Hierarchical.Total, actual.Hierarchical.Total) { t.Errorf("Expected hierarchical total page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Total, actual.Hierarchical.Total) } if !reflect.DeepEqual(expected.Hierarchical.File, actual.Hierarchical.File) { t.Errorf("Expected hierarchical file page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.File, actual.Hierarchical.File) } if !reflect.DeepEqual(expected.Hierarchical.Anon, actual.Hierarchical.Anon) { t.Errorf("Expected hierarchical anon page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Anon, actual.Hierarchical.Anon) } if !reflect.DeepEqual(expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) { t.Errorf("Expected hierarchical unevictable page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) } } cgroups-0.0.4/fs/util_test.go000066400000000000000000000015431503527177300161650ustar00rootroot00000000000000/* Utility for testing cgroup operations. Creates a mock of the cgroup filesystem for the duration of the test. */ package fs import ( "os" "path/filepath" "testing" "github.com/opencontainers/cgroups" ) func init() { cgroups.TestMode = true } // tempDir creates a new test directory for the specified subsystem. func tempDir(t testing.TB, subsystem string) string { path := filepath.Join(t.TempDir(), subsystem) // Ensure the full mock cgroup path exists. if err := os.Mkdir(path, 0o755); err != nil { t.Fatal(err) } return path } // writeFileContents writes the specified contents to the mocks of the specified // cgroup files. func writeFileContents(t testing.TB, path string, fileContents map[string]string) { for file, contents := range fileContents { err := cgroups.WriteFile(path, file, contents) if err != nil { t.Fatal(err) } } } cgroups-0.0.4/fs2/000077500000000000000000000000001503527177300137015ustar00rootroot00000000000000cgroups-0.0.4/fs2/cpu.go000066400000000000000000000057421503527177300150270ustar00rootroot00000000000000package fs2 import ( "bufio" "errors" "os" "strconv" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) func isCPUSet(r *cgroups.Resources) bool { return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil } func setCPU(dirPath string, r *cgroups.Resources) error { if !isCPUSet(r) { return nil } if r.CPUIdle != nil { if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil { return err } } // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. if r.CpuWeight != 0 { if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { return err } } var burst string if r.CpuBurst != nil { burst = strconv.FormatUint(*r.CpuBurst, 10) if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { // Sometimes when the burst to be set is larger // than the current one, it is rejected by the kernel // (EINVAL) as old_quota/new_burst exceeds the parent // cgroup quota limit.
If this happens and the quota is // going to be set, ignore the error for now and retry // after setting the quota. if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { return err } } else { burst = "" } } if r.CpuQuota != 0 || r.CpuPeriod != 0 { str := "max" if r.CpuQuota > 0 { str = strconv.FormatInt(r.CpuQuota, 10) } period := r.CpuPeriod if period == 0 { // This default value is documented in // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html period = 100000 } str += " " + strconv.FormatUint(period, 10) if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { return err } if burst != "" { if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { return err } } } return nil } func statCpu(dirPath string, stats *cgroups.Stats) error { const file = "cpu.stat" f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) if err != nil { return err } defer f.Close() sc := bufio.NewScanner(f) for sc.Scan() { t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return &parseError{Path: dirPath, File: file, Err: err} } switch t { case "usage_usec": stats.CpuStats.CpuUsage.TotalUsage = v * 1000 case "user_usec": stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 case "system_usec": stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 case "nr_periods": stats.CpuStats.ThrottlingData.Periods = v case "nr_throttled": stats.CpuStats.ThrottlingData.ThrottledPeriods = v case "throttled_usec": stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 case "nr_bursts": stats.CpuStats.BurstData.BurstsPeriods = v case "burst_usec": stats.CpuStats.BurstData.BurstTime = v * 1000 } } if err := sc.Err(); err != nil { return &parseError{Path: dirPath, File: file, Err: err} } return nil } cgroups-0.0.4/fs2/cpuset.go000066400000000000000000000010221503527177300155260ustar00rootroot00000000000000package fs2 import ( "github.com/opencontainers/cgroups" ) func isCpusetSet(r *cgroups.Resources) bool { return r.CpusetCpus != "" || r.CpusetMems != "" } func setCpuset(dirPath string, r *cgroups.Resources) error { if !isCpusetSet(r) { return nil } if r.CpusetCpus != "" { if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil { return err } } if r.CpusetMems != "" { if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil { return err } } return nil } cgroups-0.0.4/fs2/create.go000066400000000000000000000107071503527177300155000ustar00rootroot00000000000000package fs2 import ( "fmt" "os" "path/filepath" "strings" "github.com/opencontainers/cgroups" ) func supportedControllers() (string, error) { return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers") } // needAnyControllers returns whether we enable some supported controllers or not, // based on (1) controllers available and (2) resources that are being set. // We don't check "pseudo" controllers such as // "freezer" and "devices". 
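// For example, a Resources with only PidsLimit set needs only the
// "pids" controller, so false is returned when "pids" is absent from
// cgroup.controllers.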
func needAnyControllers(r *cgroups.Resources) (bool, error) { if r == nil { return false, nil } // list of all available controllers content, err := supportedControllers() if err != nil { return false, err } avail := make(map[string]struct{}) for _, ctr := range strings.Fields(content) { avail[ctr] = struct{}{} } // check whether the controller is available or not have := func(controller string) bool { _, ok := avail[controller] return ok } if isPidsSet(r) && have("pids") { return true, nil } if isMemorySet(r) && have("memory") { return true, nil } if isIoSet(r) && have("io") { return true, nil } if isCPUSet(r) && have("cpu") { return true, nil } if isCpusetSet(r) && have("cpuset") { return true, nil } if isHugeTlbSet(r) && have("hugetlb") { return true, nil } return false, nil } // containsDomainController returns whether the current config contains a domain controller or not. // Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html // As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. func containsDomainController(r *cgroups.Resources) bool { return isMemorySet(r) || isIoSet(r) || isCPUSet(r) || isHugeTlbSet(r) } // CreateCgroupPath creates a cgroupv2 path, enabling all the supported controllers. func CreateCgroupPath(path string, c *cgroups.Cgroup) (Err error) { if !strings.HasPrefix(path, UnifiedMountpoint) { return fmt.Errorf("invalid cgroup path %s", path) } content, err := supportedControllers() if err != nil { return err } const ( cgTypeFile = "cgroup.type" cgStCtlFile = "cgroup.subtree_control" ) ctrs := strings.Fields(content) res := "+" + strings.Join(ctrs, " +") elements := strings.Split(path, "/") elements = elements[3:] current := "/sys/fs" for i, e := range elements { current = filepath.Join(current, e) if i > 0 { if err := os.Mkdir(current, 0o755); err != nil { if !os.IsExist(err) { return err } } else { // If the directory was created, be sure it is not left around on errors. current := current defer func() { if Err != nil { os.Remove(current) } }() } cgType, _ := cgroups.ReadFile(current, cgTypeFile) cgType = strings.TrimSpace(cgType) switch cgType { // If the cgroup is in an invalid mode (usually this means there's an internal // process in the cgroup tree, because we created a cgroup under an // already-populated-by-other-processes cgroup), then we have to error out if // the user requested controllers which are not thread-aware. However, if all // the controllers requested are thread-aware we can simply put the cgroup into // threaded mode. case "domain invalid": if containsDomainController(c.Resources) { return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) } else { // Not entirely correct (in theory we'd always want to be a domain -- // since that means we're a properly delegated cgroup subtree) but in // this case there's not much we can do and it's better than giving an // error. _ = cgroups.WriteFile(current, cgTypeFile, "threaded") } // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers // (and you cannot usually take a cgroup out of threaded mode).
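// (A threaded cgroup can only host thread-aware controllers, so the
// same containsDomainController check applies here as well.)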
case "domain threaded": fallthrough case "threaded": if containsDomainController(c.Resources) { return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) } } } // enable all supported controllers if i < len(elements)-1 { if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { // try write one by one allCtrs := strings.Split(res, " ") for _, ctr := range allCtrs { _ = cgroups.WriteFile(current, cgStCtlFile, ctr) } } // Some controllers might not be enabled when rootless or containerized, // but we don't catch the error here. (Caught in setXXX() functions.) } } return nil } cgroups-0.0.4/fs2/defaultpath.go000066400000000000000000000042011503527177300165260ustar00rootroot00000000000000/* Copyright The containerd Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package fs2 import ( "bufio" "errors" "io" "os" "path/filepath" "strings" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/internal/path" ) const UnifiedMountpoint = "/sys/fs/cgroup" func defaultDirPath(c *cgroups.Cgroup) (string, error) { innerPath, err := path.Inner(c) if err != nil { return "", err } if filepath.IsAbs(innerPath) { return filepath.Join(UnifiedMountpoint, innerPath), nil } // we don't need to use /proc/thread-self here because runc always runs // with every thread in the same cgroup. This lets us avoid having to do // runtime.LockOSThread. ownCgroup, err := parseCgroupFile("/proc/self/cgroup") if err != nil { return "", err } // The current user scope most probably has tasks in it already, // making it impossible to enable controllers for its sub-cgroup. // A parent cgroup (with no tasks in it) is what we need. ownCgroup = filepath.Dir(ownCgroup) return filepath.Join(UnifiedMountpoint, ownCgroup, innerPath), nil } // parseCgroupFile parses /proc/PID/cgroup file and return string func parseCgroupFile(path string) (string, error) { f, err := os.Open(path) if err != nil { return "", err } defer f.Close() return parseCgroupFromReader(f) } func parseCgroupFromReader(r io.Reader) (string, error) { s := bufio.NewScanner(r) for s.Scan() { // "0::/user.slice/user-1001.slice/session-1.scope" if path, ok := strings.CutPrefix(s.Text(), "0::"); ok { return path, nil } } if err := s.Err(); err != nil { return "", err } return "", errors.New("cgroup path not found") } cgroups-0.0.4/fs2/defaultpath_test.go000066400000000000000000000045251503527177300175760ustar00rootroot00000000000000/* Copyright The containerd Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package fs2 import ( "path/filepath" "strings" "testing" "github.com/opencontainers/cgroups" ) func TestParseCgroupFromReader(t *testing.T) { cases := map[string]string{ "0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", "2:cpuset:/foo\n1:name=systemd:/\n": "", "2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", } for s, expected := range cases { g, err := parseCgroupFromReader(strings.NewReader(s)) if expected != "" { if g != expected { t.Errorf("expected %q, got %q", expected, g) } if err != nil { t.Error(err) } } else { if err == nil { t.Error("error is expected") } } } } func TestDefaultDirPath(t *testing.T) { if !cgroups.IsCgroup2UnifiedMode() { t.Skip("need cgroupv2") } // same code as in defaultDirPath() ownCgroup, err := parseCgroupFile("/proc/self/cgroup") if err != nil { // Not a test failure, but rather some weird // environment so we can't run this test. t.Skipf("can't get own cgroup: %v", err) } ownCgroup = filepath.Dir(ownCgroup) cases := []struct { cgPath string cgParent string cgName string expected string }{ { cgPath: "/foo/bar", expected: "/sys/fs/cgroup/foo/bar", }, { cgPath: "foo/bar", expected: filepath.Join(UnifiedMountpoint, ownCgroup, "foo/bar"), }, } for _, c := range cases { cg := &cgroups.Cgroup{ Path: c.cgPath, Parent: c.cgParent, Name: c.cgName, } got, err := defaultDirPath(cg) if err != nil { t.Fatal(err) } if got != c.expected { t.Fatalf("expected %q, got %q", c.expected, got) } } } cgroups-0.0.4/fs2/freezer.go000066400000000000000000000102551503527177300156750ustar00rootroot00000000000000package fs2 import ( "bufio" "errors" "fmt" "os" "strings" "time" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" ) func setFreezer(dirPath string, state cgroups.FreezerState) error { var stateStr string switch state { case cgroups.Undefined: return nil case cgroups.Frozen: stateStr = "1" case cgroups.Thawed: stateStr = "0" default: return fmt.Errorf("invalid freezer state %q requested", state) } fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) if err != nil { // We can ignore this request as long as the user didn't ask us to // freeze the container (since without the freezer cgroup, that's a // no-op). if state != cgroups.Frozen { return nil } return fmt.Errorf("freezer not supported: %w", err) } defer fd.Close() if _, err := fd.WriteString(stateStr); err != nil { return err } // Confirm that the cgroup did actually change states. if actualState, err := readFreezer(dirPath, fd); err != nil { return err } else if actualState != state { return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) } return nil } func getFreezer(dirPath string) (cgroups.FreezerState, error) { fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) if err != nil { // If the kernel is too old, then we just treat the freezer as // being in an "undefined" state and ignore the error. return cgroups.Undefined, ignoreNotExistOrNoDeviceError(err) } defer fd.Close() return readFreezer(dirPath, fd) } func readFreezer(dirPath string, fd *os.File) (cgroups.FreezerState, error) { if _, err := fd.Seek(0, 0); err != nil { // If the cgroup path is deleted at this point, then we just treat the freezer as // being in an "undefined" state and ignore the error. 
return cgroups.Undefined, ignoreNotExistOrNoDeviceError(err) } state := make([]byte, 2) if _, err := fd.Read(state); err != nil { // If the cgroup path is deleted at this point, then we just treat the freezer as // being in an "undefined" state and ignore the error. return cgroups.Undefined, ignoreNotExistOrNoDeviceError(err) } switch string(state) { case "0\n": return cgroups.Thawed, nil case "1\n": return waitFrozen(dirPath) default: return cgroups.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) } } // ignoreNotExistOrNoDeviceError checks if the error is either a "not exist" error // or a "no device" error, and returns nil in those cases. Otherwise, it returns the error. func ignoreNotExistOrNoDeviceError(err error) error { // We can safely ignore the error in the following two common situations: // 1. The cgroup path does not exist at the time of opening(eg: the kernel is too old) // — indicated by os.IsNotExist. // 2. The cgroup path is deleted during the seek/read operation — indicated by // errors.Is(err, unix.ENODEV). // These conditions are expected and do not require special handling. if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { return nil } return err } // waitFrozen polls cgroup.events until it sees "frozen 1" in it. func waitFrozen(dirPath string) (cgroups.FreezerState, error) { fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) if err != nil { return cgroups.Undefined, err } defer fd.Close() // XXX: Simple wait/read/retry is used here. An implementation // based on poll(2) or inotify(7) is possible, but it makes the code // much more complicated. Maybe address this later. const ( // Perform maxIter with waitTime in between iterations. waitTime = 10 * time.Millisecond maxIter = 1000 ) scanner := bufio.NewScanner(fd) for i := 0; scanner.Scan(); { if i == maxIter { return cgroups.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) } if val, ok := strings.CutPrefix(scanner.Text(), "frozen "); ok { if val[0] == '1' { return cgroups.Frozen, nil } i++ // wait, then re-read time.Sleep(waitTime) _, err := fd.Seek(0, 0) if err != nil { return cgroups.Undefined, err } } } // Should only reach here either on read error, // or if the file does not contain "frozen " line. return cgroups.Undefined, scanner.Err() } cgroups-0.0.4/fs2/fs2.go000066400000000000000000000204021503527177300147200ustar00rootroot00000000000000package fs2 import ( "errors" "fmt" "os" "strings" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) type parseError = fscommon.ParseError type Manager struct { config *cgroups.Cgroup // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" dirPath string // controllers is content of "cgroup.controllers" file. // excludes pseudo-controllers ("devices" and "freezer"). controllers map[string]struct{} } // NewManager creates a manager for cgroup v2 unified hierarchy. // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". // If dirPath is empty, it is automatically set using config. 
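// A minimal usage sketch (the cgroup path here is hypothetical):
//
//	m, err := NewManager(&cgroups.Cgroup{Path: "/mycontainer"}, "")
//	if err != nil {
//		return err
//	}
//	err = m.Apply(pid) // create the cgroup and put pid into it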
func NewManager(config *cgroups.Cgroup, dirPath string) (*Manager, error) { if dirPath == "" { var err error dirPath, err = defaultDirPath(config) if err != nil { return nil, err } } m := &Manager{ config: config, dirPath: dirPath, } return m, nil } func (m *Manager) getControllers() error { if m.controllers != nil { return nil } data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") if err != nil { if m.config.Rootless && m.config.Path == "" { return nil } return err } fields := strings.Fields(data) m.controllers = make(map[string]struct{}, len(fields)) for _, c := range fields { m.controllers[c] = struct{}{} } return nil } func (m *Manager) Apply(pid int) error { if err := CreateCgroupPath(m.dirPath, m.config); err != nil { // Related tests: // - "runc create (no limits + no cgrouppath + no permission) succeeds" // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" if m.config.Rootless { if m.config.Path == "" { if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { return cgroups.ErrRootless } return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) } } return err } if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { return err } return nil } func (m *Manager) GetPids() ([]int, error) { return cgroups.GetPids(m.dirPath) } func (m *Manager) GetAllPids() ([]int, error) { return cgroups.GetAllPids(m.dirPath) } func (m *Manager) GetStats() (*cgroups.Stats, error) { var errs []error st := cgroups.NewStats() // pids (since kernel 4.5) if err := statPids(m.dirPath, st); err != nil { errs = append(errs, err) } // memory (since kernel 4.5) if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } // io (since kernel 4.5) if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } // cpu (since kernel 4.15) // Note cpu.stat is available even if the controller is not enabled. if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } // PSI (since kernel 4.20). 
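// statPSI is expected to tolerate missing *.pressure files itself
// (PSI needs kernel 4.20+ and CONFIG_PSI), so errors reported here
// are assumed to be real failures rather than mere absence.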
var err error if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil { errs = append(errs, err) } if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil { errs = append(errs, err) } if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil { errs = append(errs, err) } // hugetlb (since kernel 5.6) if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } // rdma (since kernel 4.11) if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } // misc (since kernel 5.13) if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } if len(errs) > 0 && !m.config.Rootless { return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) } return st, nil } func (m *Manager) Freeze(state cgroups.FreezerState) error { if m.config.Resources == nil { return errors.New("cannot toggle freezer: cgroups not configured for container") } if err := setFreezer(m.dirPath, state); err != nil { return err } m.config.Resources.Freezer = state return nil } func (m *Manager) Destroy() error { return cgroups.RemovePath(m.dirPath) } func (m *Manager) Path(_ string) string { return m.dirPath } func (m *Manager) Set(r *cgroups.Resources) error { if r == nil { return nil } if err := m.getControllers(); err != nil { return err } // pids (since kernel 4.5) if err := setPids(m.dirPath, r); err != nil { return err } // memory (since kernel 4.5) if err := setMemory(m.dirPath, r); err != nil { return err } // io (since kernel 4.5) if err := setIo(m.dirPath, r); err != nil { return err } // cpu (since kernel 4.15) if err := setCPU(m.dirPath, r); err != nil { return err } // devices (since kernel 4.15, pseudo-controller) // // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. // However, errors from other subsystems are not ignored. // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" if err := setDevices(m.dirPath, r); err != nil { if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { return err } } // cpuset (since kernel 5.0) if err := setCpuset(m.dirPath, r); err != nil { return err } // hugetlb (since kernel 5.6) if err := setHugeTlb(m.dirPath, r); err != nil { return err } // rdma (since kernel 4.11) if err := fscommon.RdmaSet(m.dirPath, r); err != nil { return err } // freezer (since kernel 5.2, pseudo-controller) if err := setFreezer(m.dirPath, r.Freezer); err != nil { return err } if err := m.setUnified(r.Unified); err != nil { return err } m.config.Resources = r return nil } func setDevices(dirPath string, r *cgroups.Resources) error { if cgroups.DevicesSetV2 == nil { if len(r.Devices) > 0 { return cgroups.ErrDevicesUnsupported } return nil } return cgroups.DevicesSetV2(dirPath, r) } func (m *Manager) setUnified(res map[string]string) error { for k, v := range res { if strings.Contains(k, "/") { return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) } if err := cgroups.WriteFileByLine(m.dirPath, k, v); err != nil { // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { // Check if a controller is available, // to give more specific error if not. 
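// For example, for "memory.high" the controller part is "memory",
// which must appear in cgroup.controllers unless it is the "cgroup"
// core prefix itself.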
c, _, ok := strings.Cut(k, ".") if !ok { return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) } if _, ok := m.controllers[c]; !ok && c != "cgroup" { return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) } } return fmt.Errorf("unable to set unified resource %q: %w", k, err) } } return nil } func (m *Manager) GetPaths() map[string]string { paths := make(map[string]string, 1) paths[""] = m.dirPath return paths } func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) { return m.config, nil } func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) { return getFreezer(m.dirPath) } func (m *Manager) Exists() bool { return cgroups.PathExists(m.dirPath) } func OOMKillCount(path string) (uint64, error) { return fscommon.GetValueByKey(path, "memory.events", "oom_kill") } func (m *Manager) OOMKillCount() (uint64, error) { c, err := OOMKillCount(m.dirPath) if err != nil && m.config.Rootless && os.IsNotExist(err) { err = nil } return c, err } func CheckMemoryUsage(dirPath string, r *cgroups.Resources) error { if !r.MemoryCheckBeforeUpdate { return nil } if r.Memory <= 0 && r.MemorySwap <= 0 { return nil } usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current") if err != nil { // This check is on best-effort basis, so if we can't read the // current usage (cgroup not yet created, or any other error), // we should not fail. return nil } if r.MemorySwap > 0 { if uint64(r.MemorySwap) <= usage { return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage) } } if r.Memory > 0 { if uint64(r.Memory) <= usage { return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage) } } return nil } cgroups-0.0.4/fs2/hugetlb.go000066400000000000000000000026631503527177300156710ustar00rootroot00000000000000package fs2 import ( "errors" "os" "strconv" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) func isHugeTlbSet(r *cgroups.Resources) bool { return len(r.HugetlbLimit) > 0 } func setHugeTlb(dirPath string, r *cgroups.Resources) error { if !isHugeTlbSet(r) { return nil } const suffix = ".max" skipRsvd := false for _, hugetlb := range r.HugetlbLimit { prefix := "hugetlb." + hugetlb.Pagesize val := strconv.FormatUint(hugetlb.Limit, 10) if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil { return err } if skipRsvd { continue } if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil { if errors.Is(err, os.ErrNotExist) { skipRsvd = true continue } return err } } return nil } func statHugeTlb(dirPath string, stats *cgroups.Stats) error { hugetlbStats := cgroups.HugetlbStats{} rsvd := ".rsvd" for _, pagesize := range cgroups.HugePageSizes() { prefix := "hugetlb." 
+ pagesize again: value, err := fscommon.GetCgroupParamUint(dirPath, prefix+rsvd+".current") if err != nil { if rsvd != "" && errors.Is(err, os.ErrNotExist) { rsvd = "" goto again } return err } hugetlbStats.Usage = value value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max") if err != nil { return err } hugetlbStats.Failcnt = value stats.HugetlbStats[pagesize] = hugetlbStats } return nil } cgroups-0.0.4/fs2/io.go000066400000000000000000000117201503527177300146400ustar00rootroot00000000000000package fs2 import ( "bufio" "bytes" "fmt" "os" "strconv" "strings" "github.com/sirupsen/logrus" "github.com/opencontainers/cgroups" ) func isIoSet(r *cgroups.Resources) bool { return r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 || len(r.BlkioThrottleReadBpsDevice) > 0 || len(r.BlkioThrottleWriteBpsDevice) > 0 || len(r.BlkioThrottleReadIOPSDevice) > 0 || len(r.BlkioThrottleWriteIOPSDevice) > 0 } // bfqDeviceWeightSupported checks for per-device BFQ weight support (added // in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". func bfqDeviceWeightSupported(bfq *os.File) bool { if bfq == nil { return false } _, _ = bfq.Seek(0, 0) buf := make([]byte, 32) _, _ = bfq.Read(buf) // If only a single number (the default weight) is read back, we have an older kernel. _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) return err != nil } func setIo(dirPath string, r *cgroups.Resources) error { if !isIoSet(r) { return nil } // If BFQ IO scheduler is available, use it. var bfq *os.File if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 { var err error bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR) if err == nil { defer bfq.Close() } else if !os.IsNotExist(err) { return err } } if r.BlkioWeight != 0 { if bfq != nil { // Use BFQ. if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { return err } } else { // Fallback to io.weight with a conversion scheme.
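// (The helper is assumed to map the blkio weight range [10..1000]
// onto io.weight's [1..10000], typically as 1 + (weight-10)*9999/990.)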
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { return err } } } if bfqDeviceWeightSupported(bfq) { for _, wd := range r.BlkioWeightDevice { if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) } } } for _, td := range r.BlkioThrottleReadBpsDevice { if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { return err } } for _, td := range r.BlkioThrottleWriteBpsDevice { if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { return err } } for _, td := range r.BlkioThrottleReadIOPSDevice { if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { return err } } for _, td := range r.BlkioThrottleWriteIOPSDevice { if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { return err } } return nil } func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { ret := map[string][]string{} f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) if err != nil { return nil, err } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { line := scanner.Text() parts := strings.Fields(line) if len(parts) < 2 { continue } ret[parts[0]] = parts[1:] } if err := scanner.Err(); err != nil { return nil, &parseError{Path: dirPath, File: name, Err: err} } return ret, nil } func statIo(dirPath string, stats *cgroups.Stats) error { const file = "io.stat" values, err := readCgroup2MapFile(dirPath, file) if err != nil { return err } // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt var parsedStats cgroups.BlkioStats for k, v := range values { d := strings.Split(k, ":") if len(d) != 2 { continue } major, err := strconv.ParseUint(d[0], 10, 64) if err != nil { return &parseError{Path: dirPath, File: file, Err: err} } minor, err := strconv.ParseUint(d[1], 10, 64) if err != nil { return &parseError{Path: dirPath, File: file, Err: err} } for _, item := range v { d := strings.Split(item, "=") if len(d) != 2 { continue } op := d[0] // Map to the cgroupv1 naming and layout (in separate tables). var targetTable *[]cgroups.BlkioStatEntry switch op { // Equivalent to cgroupv1's blkio.io_service_bytes. case "rbytes": op = "Read" targetTable = &parsedStats.IoServiceBytesRecursive case "wbytes": op = "Write" targetTable = &parsedStats.IoServiceBytesRecursive // Equivalent to cgroupv1's blkio.io_serviced. case "rios": op = "Read" targetTable = &parsedStats.IoServicedRecursive case "wios": op = "Write" targetTable = &parsedStats.IoServicedRecursive default: // Skip over entries we cannot map to cgroupv1 stats for now. // In the future we should expand the stats struct to include // them. 
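// (For example, the dbytes/dios discard counters that appear in
// io_test.go's sample data are skipped by this branch.)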
logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) continue } value, err := strconv.ParseUint(d[1], 10, 64) if err != nil { return &parseError{Path: dirPath, File: file, Err: err} } entry := cgroups.BlkioStatEntry{ Op: op, Major: major, Minor: minor, Value: value, } *targetTable = append(*targetTable, entry) } } stats.BlkioStats = parsedStats return nil } cgroups-0.0.4/fs2/io_test.go000066400000000000000000000046011503527177300156770ustar00rootroot00000000000000package fs2 import ( "os" "path/filepath" "reflect" "sort" "testing" "github.com/opencontainers/cgroups" ) const exampleIoStatData = `254:1 rbytes=6901432320 wbytes=14245535744 rios=263278 wios=248603 dbytes=0 dios=0 254:0 rbytes=2702336 wbytes=0 rios=97 wios=0 dbytes=0 dios=0 259:0 rbytes=6911345664 wbytes=14245536256 rios=264538 wios=244914 dbytes=530485248 dios=2` var exampleIoStatsParsed = cgroups.BlkioStats{ IoServiceBytesRecursive: []cgroups.BlkioStatEntry{ {Major: 254, Minor: 1, Value: 6901432320, Op: "Read"}, {Major: 254, Minor: 1, Value: 14245535744, Op: "Write"}, {Major: 254, Minor: 0, Value: 2702336, Op: "Read"}, {Major: 254, Minor: 0, Value: 0, Op: "Write"}, {Major: 259, Minor: 0, Value: 6911345664, Op: "Read"}, {Major: 259, Minor: 0, Value: 14245536256, Op: "Write"}, }, IoServicedRecursive: []cgroups.BlkioStatEntry{ {Major: 254, Minor: 1, Value: 263278, Op: "Read"}, {Major: 254, Minor: 1, Value: 248603, Op: "Write"}, {Major: 254, Minor: 0, Value: 97, Op: "Read"}, {Major: 254, Minor: 0, Value: 0, Op: "Write"}, {Major: 259, Minor: 0, Value: 264538, Op: "Read"}, {Major: 259, Minor: 0, Value: 244914, Op: "Write"}, }, } func lessBlkioStatEntry(a, b cgroups.BlkioStatEntry) bool { if a.Major != b.Major { return a.Major < b.Major } if a.Minor != b.Minor { return a.Minor < b.Minor } if a.Op != b.Op { return a.Op < b.Op } return a.Value < b.Value } func sortBlkioStats(stats *cgroups.BlkioStats) { for _, table := range []*[]cgroups.BlkioStatEntry{ &stats.IoServicedRecursive, &stats.IoServiceBytesRecursive, } { sort.SliceStable(*table, func(i, j int) bool { return lessBlkioStatEntry((*table)[i], (*table)[j]) }) } } func TestStatIo(t *testing.T) { // We're using a fake cgroupfs. cgroups.TestMode = true fakeCgroupDir := t.TempDir() statPath := filepath.Join(fakeCgroupDir, "io.stat") if err := os.WriteFile(statPath, []byte(exampleIoStatData), 0o644); err != nil { t.Fatal(err) } var gotStats cgroups.Stats if err := statIo(fakeCgroupDir, &gotStats); err != nil { t.Error(err) } // Sort the output since statIo uses a map internally. sortBlkioStats(&gotStats.BlkioStats) sortBlkioStats(&exampleIoStatsParsed) if !reflect.DeepEqual(gotStats.BlkioStats, exampleIoStatsParsed) { t.Errorf("parsed cgroupv2 io.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.BlkioStats, exampleIoStatsParsed) } } cgroups-0.0.4/fs2/memory.go000066400000000000000000000144651503527177300155520ustar00rootroot00000000000000package fs2 import ( "bufio" "errors" "math" "os" "strconv" "strings" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) // numToStr converts an int64 value to a string for writing to a // cgroupv2 files with .min, .max, .low, or .high suffix. // The value of -1 is converted to "max" for cgroupv1 compatibility // (which used to write -1 to remove the limit). 
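//
// A few illustrative values (mirroring the switch below):
//
//	numToStr(-1) == "max" // no limit
//	numToStr(0)  == ""    // unset; callers skip the write
//	numToStr(42) == "42"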
func numToStr(value int64) string { switch value { case 0: return "" case -1: return "max" } return strconv.FormatInt(value, 10) } func isMemorySet(r *cgroups.Resources) bool { return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 } func setMemory(dirPath string, r *cgroups.Resources) error { if !isMemorySet(r) { return nil } if err := CheckMemoryUsage(dirPath, r); err != nil { return err } swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) if err != nil { return err } swapStr := numToStr(swap) if swapStr == "" && swap == 0 && r.MemorySwap > 0 { // memory and memorySwap set to the same value -- disable swap swapStr = "0" } // never write empty string to `memory.swap.max`, it means set to 0. if swapStr != "" { if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { // If swap is not enabled, silently ignore setting to max or disabling it. if !(errors.Is(err, os.ErrNotExist) && (swapStr == "max" || swapStr == "0")) { //nolint:staticcheck // Ignore "QF1001: could apply De Morgan's law". return err } } } if val := numToStr(r.Memory); val != "" { if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { return err } } // cgroup.Resources.KernelMemory is ignored if val := numToStr(r.MemoryReservation); val != "" { if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { return err } } return nil } func statMemory(dirPath string, stats *cgroups.Stats) error { const file = "memory.stat" statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) if err != nil { return err } defer statsFile.Close() sc := bufio.NewScanner(statsFile) for sc.Scan() { t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return &parseError{Path: dirPath, File: file, Err: err} } stats.MemoryStats.Stats[t] = v } if err := sc.Err(); err != nil { return &parseError{Path: dirPath, File: file, Err: err} } stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] // Unlike cgroup v1 which has memory.use_hierarchy binary knob, // cgroup v2 is always hierarchical. stats.MemoryStats.UseHierarchy = true memoryUsage, err := getMemoryDataV2(dirPath, "") if err != nil { if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { // The root cgroup does not have memory.{current,max,peak} // so emulate those using data from /proc/meminfo and // /sys/fs/cgroup/memory.stat return rootStatsFromMeminfo(stats) } return err } stats.MemoryStats.Usage = memoryUsage swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") if err != nil { return err } stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage swapUsage := swapOnlyUsage // As cgroup v1 reports SwapUsage values as mem+swap combined, // while in cgroup v2 swap values do not include memory, // report combined mem+swap for v1 compatibility. swapUsage.Usage += memoryUsage.Usage if swapUsage.Limit != math.MaxUint64 { swapUsage.Limit += memoryUsage.Limit } // The `MaxUsage` of mem+swap cannot simply combine mem with // swap. So set it to 0 for v1 compatibility. swapUsage.MaxUsage = 0 stats.MemoryStats.SwapUsage = swapUsage return nil } func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { memoryData := cgroups.MemoryData{} moduleName := "memory" if name != "" { moduleName = "memory." 
+ name
	}
	usage := moduleName + ".current"
	limit := moduleName + ".max"
	maxUsage := moduleName + ".peak"

	value, err := fscommon.GetCgroupParamUint(path, usage)
	if err != nil {
		if name != "" && os.IsNotExist(err) {
			// Ignore ENOENT as there's no swap accounting
			// if kernel CONFIG_MEMCG_SWAP is not set or
			// swapaccount=0 kernel boot parameter is given.
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, err
	}
	memoryData.Usage = value

	value, err = fscommon.GetCgroupParamUint(path, limit)
	if err != nil {
		return cgroups.MemoryData{}, err
	}
	memoryData.Limit = value

	// `memory.peak` since kernel 5.19
	// `memory.swap.peak` since kernel 6.5
	value, err = fscommon.GetCgroupParamUint(path, maxUsage)
	if err != nil && !os.IsNotExist(err) {
		return cgroups.MemoryData{}, err
	}
	memoryData.MaxUsage = value

	return memoryData, nil
}

func rootStatsFromMeminfo(stats *cgroups.Stats) error {
	const file = "/proc/meminfo"
	f, err := os.Open(file)
	if err != nil {
		return err
	}
	defer f.Close()

	// Fields we are interested in.
	var (
		swap_free  uint64
		swap_total uint64
	)
	mem := map[string]*uint64{
		"SwapFree":  &swap_free,
		"SwapTotal": &swap_total,
	}

	found := 0
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		parts := strings.SplitN(sc.Text(), ":", 3)
		if len(parts) != 2 {
			// Should not happen.
			continue
		}
		k := parts[0]
		p, ok := mem[k]
		if !ok {
			// Unknown field -- not interested.
			continue
		}
		vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB"))
		*p, err = strconv.ParseUint(vStr, 10, 64)
		if err != nil {
			return &parseError{File: file, Err: errors.New("bad value for " + k)}
		}

		found++
		if found == len(mem) {
			// Got everything we need -- skip the rest.
			break
		}
	}
	if err := sc.Err(); err != nil {
		return &parseError{Path: "", File: file, Err: err}
	}

	// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
	//   - rss (NR_ANON_MAPPED)
	//   - cache (NR_FILE_PAGES)
	// cgroup v1 reports SwapUsage values as mem+swap combined
	// cgroup v2 reports rss and cache as anon and file.
	// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
	// sum swap usage as combined mem+swap usage for consistency as well.
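	// Worked example: with anon=790425600 and file=6502666240 (the
	// values used in TestRootStatsFromMeminfo below), Usage comes out
	// as 790425600 + 6502666240 = 7293091840 bytes; swap-only usage is
	// (swap_total - swap_free) KiB converted to bytes, with Usage then
	// added on top for v1-style mem+swap reporting.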
stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"] stats.MemoryStats.Usage.Limit = math.MaxUint64 stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage return nil } cgroups-0.0.4/fs2/memory_test.go000066400000000000000000000110341503527177300165760ustar00rootroot00000000000000package fs2 import ( "os" "path/filepath" "strings" "testing" "github.com/opencontainers/cgroups" ) const exampleMemoryStatData = `anon 790425600 file 6502666240 kernel_stack 7012352 pagetables 8867840 percpu 2445520 sock 40960 shmem 6721536 file_mapped 656187392 file_dirty 1122304 file_writeback 0 swapcached 10 anon_thp 438304768 file_thp 0 shmem_thp 0 inactive_anon 892223488 active_anon 2973696 inactive_file 5307346944 active_file 1179316224 unevictable 31477760 slab_reclaimable 348866240 slab_unreclaimable 10099808 slab 358966048 workingset_refault_anon 0 workingset_refault_file 0 workingset_activate_anon 0 workingset_activate_file 0 workingset_restore_anon 0 workingset_restore_file 0 workingset_nodereclaim 0 pgfault 103216687 pgmajfault 6879 pgrefill 0 pgscan 0 pgsteal 0 pgactivate 1110217 pgdeactivate 292 pglazyfree 267 pglazyfreed 0 thp_fault_alloc 57411 thp_collapse_alloc 443` func TestStatMemoryPodCgroupNotFound(t *testing.T) { // We're using a fake cgroupfs. cgroups.TestMode = true fakeCgroupDir := t.TempDir() // only write memory.stat to ensure pod cgroup usage // still reads memory.current. statPath := filepath.Join(fakeCgroupDir, "memory.stat") if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil { t.Fatal(err) } gotStats := cgroups.NewStats() // use a fake root path to mismatch the file we wrote. // this triggers the non-root path which should fail to find memory.current. err := statMemory(fakeCgroupDir, gotStats) if err == nil { t.Errorf("expected error when statting memory for cgroupv2 root, but was nil") } if !strings.Contains(err.Error(), "memory.current: no such file or directory") { t.Errorf("expected error to contain 'memory.current: no such file or directory', but was %s", err.Error()) } } func TestStatMemoryPodCgroup(t *testing.T) { // We're using a fake cgroupfs. cgroups.TestMode = true fakeCgroupDir := t.TempDir() statPath := filepath.Join(fakeCgroupDir, "memory.stat") if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.current"), []byte("123456789"), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.max"), []byte("999999999"), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil { t.Fatal(err) } gotStats := cgroups.NewStats() // use a fake root path to trigger the pod cgroup lookup. 
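	// statMemory should read the memory.current/memory.max/memory.peak
	// files written above instead of falling back to /proc/meminfo.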
err := statMemory(fakeCgroupDir, gotStats) if err != nil { t.Errorf("expected no error when statting memory for cgroupv2 root, but got %#+v", err) } // result should be "memory.current" var expectedUsageBytes uint64 = 123456789 if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes { t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes) } // result should be "memory.max" var expectedLimitBytes uint64 = 999999999 if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes { t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes) } // result should be "memory.peak" var expectedMaxUsageBytes uint64 = 987654321 if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes { t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes) } } func TestRootStatsFromMeminfo(t *testing.T) { stats := &cgroups.Stats{ MemoryStats: cgroups.MemoryStats{ Stats: map[string]uint64{ "anon": 790425600, "file": 6502666240, }, }, } if err := rootStatsFromMeminfo(stats); err != nil { t.Fatal(err) } // result is anon + file var expectedUsageBytes uint64 = 7293091840 if stats.MemoryStats.Usage.Usage != expectedUsageBytes { t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %d\nexpected %d\n", stats.MemoryStats.Usage.Usage, expectedUsageBytes) } // swap is adjusted to mem+swap if stats.MemoryStats.SwapUsage.Usage < stats.MemoryStats.Usage.Usage { t.Errorf("swap usage %d should be at least mem usage %d", stats.MemoryStats.SwapUsage.Usage, stats.MemoryStats.Usage.Usage) } if stats.MemoryStats.SwapUsage.Limit < stats.MemoryStats.Usage.Limit { t.Errorf("swap limit %d should be at least mem limit %d", stats.MemoryStats.SwapUsage.Limit, stats.MemoryStats.Usage.Limit) } } cgroups-0.0.4/fs2/misc.go000066400000000000000000000016041503527177300151640ustar00rootroot00000000000000package fs2 import ( "bufio" "os" "strings" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) func statMisc(dirPath string, stats *cgroups.Stats) error { for _, file := range []string{"current", "events"} { fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY) if err != nil { return err } s := bufio.NewScanner(fd) for s.Scan() { key, value, err := fscommon.ParseKeyValue(s.Text()) if err != nil { fd.Close() return err } key = strings.TrimSuffix(key, ".max") if _, ok := stats.MiscStats[key]; !ok { stats.MiscStats[key] = cgroups.MiscStats{} } tmp := stats.MiscStats[key] switch file { case "current": tmp.Usage = value case "events": tmp.Events = value } stats.MiscStats[key] = tmp } fd.Close() if err := s.Err(); err != nil { return err } } return nil } cgroups-0.0.4/fs2/misc_test.go000066400000000000000000000060611503527177300162250ustar00rootroot00000000000000package fs2 import ( "os" "path/filepath" "strings" "testing" "github.com/opencontainers/cgroups" ) const exampleMiscCurrentData = `res_a 123 res_b 456 res_c 42` const exampleMiscEventsData = `res_a.max 1 res_b.max 2 res_c.max 3` func TestStatMiscPodCgroupEmpty(t *testing.T) { // We're using a fake cgroupfs. 
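	// (Setting cgroups.TestMode, as below, relaxes the package's check
	// that files are opened on a real cgroupfs, so plain files under
	// t.TempDir() can stand in for it.)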
cgroups.TestMode = true fakeCgroupDir := t.TempDir() // create empty misc.current and misc.events files to test the common case // where no misc resource keys are available for _, file := range []string{"misc.current", "misc.events"} { if _, err := os.Create(filepath.Join(fakeCgroupDir, file)); err != nil { t.Fatal(err) } } gotStats := cgroups.NewStats() err := statMisc(fakeCgroupDir, gotStats) if err != nil { t.Errorf("expected no error when statting empty misc.current/misc.events for cgroupv2, but got %#v", err) } if len(gotStats.MiscStats) != 0 { t.Errorf("parsed cgroupv2 misc.* returns unexpected resources: got %#v but expected nothing", gotStats.MiscStats) } } func TestStatMiscPodCgroupNotFound(t *testing.T) { // We're using a fake cgroupfs. cgroups.TestMode = true fakeCgroupDir := t.TempDir() // only write misc.current to ensure pod cgroup usage // still reads misc.events. statPath := filepath.Join(fakeCgroupDir, "misc.current") if err := os.WriteFile(statPath, []byte(exampleMiscCurrentData), 0o644); err != nil { t.Fatal(err) } gotStats := cgroups.NewStats() // use a fake root path to mismatch the file we wrote. // this triggers the non-root path which should fail to find misc.events. err := statMisc(fakeCgroupDir, gotStats) if err == nil { t.Errorf("expected error when statting misc.current for cgroupv2 root, but was nil") } if !strings.Contains(err.Error(), "misc.events: no such file or directory") { t.Errorf("expected error to contain 'misc.events: no such file or directory', but was %s", err.Error()) } } func TestStatMiscPodCgroup(t *testing.T) { // We're using a fake cgroupfs. cgroups.TestMode = true fakeCgroupDir := t.TempDir() currentPath := filepath.Join(fakeCgroupDir, "misc.current") if err := os.WriteFile(currentPath, []byte(exampleMiscCurrentData), 0o644); err != nil { t.Fatal(err) } eventsPath := filepath.Join(fakeCgroupDir, "misc.events") if err := os.WriteFile(eventsPath, []byte(exampleMiscEventsData), 0o644); err != nil { t.Fatal(err) } gotStats := cgroups.NewStats() // use a fake root path to trigger the pod cgroup lookup. 
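	// statMisc should pick up res_a, res_b and res_c from the
	// misc.current and misc.events files written above.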
err := statMisc(fakeCgroupDir, gotStats) if err != nil { t.Errorf("expected no error when statting misc for cgroupv2 root, but got %#+v", err) } // make sure all res_* from exampleMisc*Data are returned if len(gotStats.MiscStats) != 3 { t.Errorf("parsed cgroupv2 misc doesn't return all expected resources: \ngot %#v\nexpected %#v\n", len(gotStats.MiscStats), 3) } var expectedUsageBytes uint64 = 42 if gotStats.MiscStats["res_c"].Usage != expectedUsageBytes { t.Errorf("parsed cgroupv2 misc.current for res_c doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MiscStats["res_c"].Usage, expectedUsageBytes) } } cgroups-0.0.4/fs2/pids.go000066400000000000000000000031651503527177300151740ustar00rootroot00000000000000package fs2 import ( "errors" "math" "os" "strings" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fscommon" ) func isPidsSet(r *cgroups.Resources) bool { return r.PidsLimit != 0 } func setPids(dirPath string, r *cgroups.Resources) error { if !isPidsSet(r) { return nil } if val := numToStr(r.PidsLimit); val != "" { if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { return err } } return nil } func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { // if the controller is not enabled, let's read PIDS from cgroups.procs // (or threads if cgroup.threads is enabled) contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") if errors.Is(err, unix.ENOTSUP) { contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") } if err != nil { return err } pids := strings.Count(contents, "\n") stats.PidsStats.Current = uint64(pids) stats.PidsStats.Limit = 0 return nil } func statPids(dirPath string, stats *cgroups.Stats) error { current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") if err != nil { if os.IsNotExist(err) { return statPidsFromCgroupProcs(dirPath, stats) } return err } max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") if err != nil { return err } // If no limit is set, read from pids.max returns "max", which is // converted to MaxUint64 by GetCgroupParamUint. Historically, we // represent "no limit" for pids as 0, thus this conversion. if max == math.MaxUint64 { max = 0 } stats.PidsStats.Current = current stats.PidsStats.Limit = max return nil } cgroups-0.0.4/fs2/psi.go000066400000000000000000000036211503527177300150250ustar00rootroot00000000000000package fs2 import ( "bufio" "errors" "fmt" "os" "strconv" "strings" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" ) func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) { f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) if err != nil { if errors.Is(err, os.ErrNotExist) { // Kernel < 4.20, or CONFIG_PSI is not set, // or PSI stats are turned off for the cgroup // ("echo 0 > cgroup.pressure", kernel >= 6.1). return nil, nil } return nil, err } defer f.Close() var psistats cgroups.PSIStats sc := bufio.NewScanner(f) for sc.Scan() { parts := strings.Fields(sc.Text()) var pv *cgroups.PSIData switch parts[0] { case "some": pv = &psistats.Some case "full": pv = &psistats.Full } if pv != nil { *pv, err = parsePSIData(parts[1:]) if err != nil { return nil, &parseError{Path: dirPath, File: file, Err: err} } } } if err := sc.Err(); err != nil { if errors.Is(err, unix.ENOTSUP) { // Some kernels (e.g. CS9) may return ENOTSUP on read // if psi=1 kernel cmdline parameter is required. 
return nil, nil } return nil, &parseError{Path: dirPath, File: file, Err: err} } return &psistats, nil } func parsePSIData(psi []string) (cgroups.PSIData, error) { data := cgroups.PSIData{} for _, f := range psi { key, val, ok := strings.Cut(f, "=") if !ok { return data, fmt.Errorf("invalid psi data: %q", f) } var pv *float64 switch key { case "avg10": pv = &data.Avg10 case "avg60": pv = &data.Avg60 case "avg300": pv = &data.Avg300 case "total": v, err := strconv.ParseUint(val, 10, 64) if err != nil { return data, fmt.Errorf("invalid %s PSI value: %w", key, err) } data.Total = v } if pv != nil { v, err := strconv.ParseFloat(val, 64) if err != nil { return data, fmt.Errorf("invalid %s PSI value: %w", key, err) } *pv = v } } return data, nil } cgroups-0.0.4/fs2/psi_test.go000066400000000000000000000016321503527177300160640ustar00rootroot00000000000000package fs2 import ( "os" "path/filepath" "reflect" "testing" "github.com/opencontainers/cgroups" ) func TestStatCPUPSI(t *testing.T) { const examplePSIData = `some avg10=1.71 avg60=2.36 avg300=2.57 total=230548833 full avg10=1.00 avg60=1.01 avg300=1.00 total=157622356` // We're using a fake cgroupfs. cgroups.TestMode = true fakeCgroupDir := t.TempDir() statPath := filepath.Join(fakeCgroupDir, "cpu.pressure") if err := os.WriteFile(statPath, []byte(examplePSIData), 0o644); err != nil { t.Fatal(err) } st, err := statPSI(fakeCgroupDir, "cpu.pressure") if err != nil { t.Fatal(err) } if !reflect.DeepEqual(*st, cgroups.PSIStats{ Some: cgroups.PSIData{ Avg10: 1.71, Avg60: 2.36, Avg300: 2.57, Total: 230548833, }, Full: cgroups.PSIData{ Avg10: 1.00, Avg60: 1.01, Avg300: 1.00, Total: 157622356, }, }) { t.Errorf("unexpected PSI result: %+v", st) } } cgroups-0.0.4/fscommon/000077500000000000000000000000001503527177300150305ustar00rootroot00000000000000cgroups-0.0.4/fscommon/rdma.go000066400000000000000000000052001503527177300162770ustar00rootroot00000000000000package fscommon import ( "bufio" "errors" "math" "os" "strconv" "strings" "golang.org/x/sys/unix" "github.com/opencontainers/cgroups" ) // parseRdmaKV parses raw string to RdmaEntry. func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { var value uint32 k, v, ok := strings.Cut(raw, "=") if !ok { return errors.New("Unable to parse RDMA entry") } if v == "max" { value = math.MaxUint32 } else { val64, err := strconv.ParseUint(v, 10, 32) if err != nil { return err } value = uint32(val64) } switch k { case "hca_handle": entry.HcaHandles = value case "hca_object": entry.HcaObjects = value } return nil } // readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. // example entry: mlx4_0 hca_handle=2 hca_object=2000 func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { rdmaEntries := make([]cgroups.RdmaEntry, 0) fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) if err != nil { return nil, err } defer fd.Close() scanner := bufio.NewScanner(fd) for scanner.Scan() { parts := strings.SplitN(scanner.Text(), " ", 4) if len(parts) == 3 { entry := new(cgroups.RdmaEntry) entry.Device = parts[0] err = parseRdmaKV(parts[1], entry) if err != nil { continue } err = parseRdmaKV(parts[2], entry) if err != nil { continue } rdmaEntries = append(rdmaEntries, *entry) } } return rdmaEntries, scanner.Err() } // RdmaGetStats returns rdma stats such as totalLimit and current entries. 
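// For instance, an entry "mlx4_0 hca_handle=2 hca_object=2000" read
// from rdma.current ends up in stats.RdmaStats.RdmaCurrent as
// RdmaEntry{Device: "mlx4_0", HcaHandles: 2, HcaObjects: 2000}.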
func RdmaGetStats(path string, stats *cgroups.Stats) error {
	currentEntries, err := readRdmaEntries(path, "rdma.current")
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			err = nil
		}
		return err
	}
	maxEntries, err := readRdmaEntries(path, "rdma.max")
	if err != nil {
		return err
	}
	// If a device got removed between reading the two files, skip returning stats.
	if len(currentEntries) != len(maxEntries) {
		return nil
	}

	stats.RdmaStats = cgroups.RdmaStats{
		RdmaLimit:   maxEntries,
		RdmaCurrent: currentEntries,
	}

	return nil
}

func createCmdString(device string, limits cgroups.LinuxRdma) string {
	cmdString := device
	if limits.HcaHandles != nil {
		cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10)
	}
	if limits.HcaObjects != nil {
		cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10)
	}
	return cmdString
}

// RdmaSet sets RDMA resources.
func RdmaSet(path string, r *cgroups.Resources) error {
	for device, limits := range r.Rdma {
		if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil {
			return err
		}
	}
	return nil
}
cgroups-0.0.4/fscommon/rdma_test.go000066400000000000000000000027501503527177300173450ustar00rootroot00000000000000package fscommon

import (
	"os"
	"path/filepath"
	"testing"

	"github.com/opencontainers/cgroups"
)

/* Roadmap for future */
// (Low-priority) TODO: Check if it is possible to virtually mimic an actual RDMA device.
// TODO: Think of more edge-cases to add.

// TestRdmaSet performs an E2E test of RdmaSet() and parseRdmaKV() using a dummy device and a dummy cgroup file-system.
// Note: the following test does not guarantee that your host supports RDMA, since it mocks the underlying infrastructure.
func TestRdmaSet(t *testing.T) {
	testCgroupPath := filepath.Join(t.TempDir(), "rdma")

	// Ensure the full mock cgroup path exists.
	err := os.Mkdir(testCgroupPath, 0o755)
	if err != nil {
		t.Fatal(err)
	}

	rdmaDevice := "mlx5_1"
	maxHandles := uint32(100)
	maxObjects := uint32(300)

	rdmaStubResource := &cgroups.Resources{
		Rdma: map[string]cgroups.LinuxRdma{
			rdmaDevice: {
				HcaHandles: &maxHandles,
				HcaObjects: &maxObjects,
			},
		},
	}

	if err := RdmaSet(testCgroupPath, rdmaStubResource); err != nil {
		t.Fatal(err)
	}

	// The default rdma.max must be written.
	rdmaEntries, err := readRdmaEntries(testCgroupPath, "rdma.max")
	if err != nil {
		t.Fatal(err)
	}
	if len(rdmaEntries) != 1 {
		t.Fatal("rdma_test: Got the wrong values while parsing entries from rdma.max")
	}
	if rdmaEntries[0].HcaHandles != maxHandles {
		t.Fatalf("rdma_test: Got the wrong value for hca_handles")
	}
	if rdmaEntries[0].HcaObjects != maxObjects {
		t.Fatalf("rdma_test: Got the wrong value for hca_Objects")
	}
}
cgroups-0.0.4/fscommon/utils.go000066400000000000000000000073441503527177300165270ustar00rootroot00000000000000package fscommon

import (
	"errors"
	"fmt"
	"math"
	"path"
	"strconv"
	"strings"

	"github.com/opencontainers/cgroups"
)

var (
	// Deprecated: use cgroups.OpenFile instead.
	OpenFile = cgroups.OpenFile
	// Deprecated: use cgroups.ReadFile instead.
	ReadFile = cgroups.ReadFile
	// Deprecated: use cgroups.WriteFile instead.
	WriteFile = cgroups.WriteFile
)

// ParseError records parse error details, including the file path.
type ParseError struct {
	Path string
	File string
	Err  error
}

func (e *ParseError) Error() string {
	return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error()
}

func (e *ParseError) Unwrap() error { return e.Err }

// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs, // some of the memory cgroup stats can be negative. func ParseUint(s string, base, bitSize int) (uint64, error) { value, err := strconv.ParseUint(s, base, bitSize) if err != nil { intValue, intErr := strconv.ParseInt(s, base, bitSize) // 1. Handle negative values greater than MinInt64 (and) // 2. Handle negative values lesser than MinInt64 if intErr == nil && intValue < 0 { return 0, nil } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { return 0, nil } return value, err } return value, nil } // ParseKeyValue parses a space-separated "key value" kind of cgroup // parameter and returns its key as a string, and its value as uint64 // (using [ParseUint] to convert the value). For example, // "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. func ParseKeyValue(t string) (string, uint64, error) { key, val, ok := strings.Cut(t, " ") if !ok || key == "" || val == "" { return "", 0, fmt.Errorf(`line %q is not in "key value" format`, t) } value, err := ParseUint(val, 10, 64) if err != nil { return "", 0, err } return key, value, nil } // GetValueByKey reads space-separated "key value" pairs from the specified // cgroup file, looking for a specified key, and returns its value as uint64, // using [ParseUint] for conversion. If the value is not found, 0 is returned. func GetValueByKey(path, file, key string) (uint64, error) { content, err := cgroups.ReadFile(path, file) if err != nil { return 0, err } key += " " lines := strings.Split(content, "\n") for _, line := range lines { v, ok := strings.CutPrefix(line, key) if ok { val, err := ParseUint(v, 10, 64) if err != nil { err = &ParseError{Path: path, File: file, Err: err} } return val, err } } return 0, nil } // GetCgroupParamUint reads a single uint64 value from the specified cgroup file. // If the value read is "max", the math.MaxUint64 is returned. func GetCgroupParamUint(path, file string) (uint64, error) { contents, err := GetCgroupParamString(path, file) if err != nil { return 0, err } if contents == "max" { return math.MaxUint64, nil } res, err := ParseUint(contents, 10, 64) if err != nil { return res, &ParseError{Path: path, File: file, Err: err} } return res, nil } // GetCgroupParamInt reads a single int64 value from specified cgroup file. // If the value read is "max", the math.MaxInt64 is returned. func GetCgroupParamInt(path, file string) (int64, error) { contents, err := GetCgroupParamString(path, file) if err != nil { return 0, err } if contents == "max" { return math.MaxInt64, nil } res, err := strconv.ParseInt(contents, 10, 64) if err != nil { return res, &ParseError{Path: path, File: file, Err: err} } return res, nil } // GetCgroupParamString reads a string from the specified cgroup file. func GetCgroupParamString(path, file string) (string, error) { contents, err := cgroups.ReadFile(path, file) if err != nil { return "", err } return strings.TrimSpace(contents), nil } cgroups-0.0.4/fscommon/utils_test.go000066400000000000000000000037541503527177300175670ustar00rootroot00000000000000package fscommon import ( "math" "os" "path/filepath" "strconv" "testing" "github.com/opencontainers/cgroups" ) const ( cgroupFile = "cgroup.file" floatValue = 2048.0 floatString = "2048" ) func init() { cgroups.TestMode = true } func TestGetCgroupParamsInt(t *testing.T) { // Setup tempdir. tempDir := t.TempDir() tempFile := filepath.Join(tempDir, cgroupFile) // Success. 
if err := os.WriteFile(tempFile, []byte(floatString), 0o755); err != nil { t.Fatal(err) } value, err := GetCgroupParamUint(tempDir, cgroupFile) if err != nil { t.Fatal(err) } else if value != floatValue { t.Fatalf("Expected %d to equal %f", value, floatValue) } // Success with new line. err = os.WriteFile(tempFile, []byte(floatString+"\n"), 0o755) if err != nil { t.Fatal(err) } value, err = GetCgroupParamUint(tempDir, cgroupFile) if err != nil { t.Fatal(err) } else if value != floatValue { t.Fatalf("Expected %d to equal %f", value, floatValue) } // Success with negative values err = os.WriteFile(tempFile, []byte("-12345"), 0o755) if err != nil { t.Fatal(err) } value, err = GetCgroupParamUint(tempDir, cgroupFile) if err != nil { t.Fatal(err) } else if value != 0 { t.Fatalf("Expected %d to equal %d", value, 0) } // Success with negative values lesser than min int64 s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) err = os.WriteFile(tempFile, []byte(s), 0o755) if err != nil { t.Fatal(err) } value, err = GetCgroupParamUint(tempDir, cgroupFile) if err != nil { t.Fatal(err) } else if value != 0 { t.Fatalf("Expected %d to equal %d", value, 0) } // Not a float. err = os.WriteFile(tempFile, []byte("not-a-float"), 0o755) if err != nil { t.Fatal(err) } _, err = GetCgroupParamUint(tempDir, cgroupFile) if err == nil { t.Fatal("Expecting error, got none") } // Unknown file. err = os.Remove(tempFile) if err != nil { t.Fatal(err) } _, err = GetCgroupParamUint(tempDir, cgroupFile) if err == nil { t.Fatal("Expecting error, got none") } } cgroups-0.0.4/getallpids.go000066400000000000000000000007711503527177300156730ustar00rootroot00000000000000package cgroups import ( "io/fs" "path/filepath" ) // GetAllPids returns all pids from the cgroup identified by path, and all its // sub-cgroups. func GetAllPids(path string) ([]int, error) { var pids []int err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error { if iErr != nil { return iErr } if !d.IsDir() { return nil } cPids, err := readProcsFile(p) if err != nil { return err } pids = append(pids, cPids...) 
return nil }) return pids, err } cgroups-0.0.4/getallpids_test.go000066400000000000000000000004121503527177300167220ustar00rootroot00000000000000package cgroups import ( "testing" ) func BenchmarkGetAllPids(b *testing.B) { total := 0 for i := 0; i < b.N; i++ { i, err := GetAllPids("/sys/fs/cgroup") if err != nil { b.Fatal(err) } total += len(i) } b.Logf("iter: %d, total: %d", b.N, total) } cgroups-0.0.4/go.mod000066400000000000000000000005411503527177300143150ustar00rootroot00000000000000module github.com/opencontainers/cgroups go 1.23.0 require ( github.com/cilium/ebpf v0.17.3 github.com/coreos/go-systemd/v22 v22.5.0 github.com/cyphar/filepath-securejoin v0.4.1 github.com/godbus/dbus/v5 v5.1.0 github.com/moby/sys/mountinfo v0.7.2 github.com/moby/sys/userns v0.1.0 github.com/sirupsen/logrus v1.9.3 golang.org/x/sys v0.30.0 ) cgroups-0.0.4/go.sum000066400000000000000000000107451503527177300143510ustar00rootroot00000000000000github.com/cilium/ebpf v0.17.3 h1:FnP4r16PWYSE4ux6zN+//jMcW4nMVRvuTLVTvCjyyjg= github.com/cilium/ebpf v0.17.3/go.mod h1:G5EDHij8yiLzaqn0WjyfJHvRa+3aDlReIaLVRMvOyJk= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg= github.com/moby/sys/mountinfo v0.7.2/go.mod h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4= 
github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= cgroups-0.0.4/internal/000077500000000000000000000000001503527177300150235ustar00rootroot00000000000000cgroups-0.0.4/internal/path/000077500000000000000000000000001503527177300157575ustar00rootroot00000000000000cgroups-0.0.4/internal/path/path.go000066400000000000000000000030051503527177300172400ustar00rootroot00000000000000package path import ( "errors" "os" "path/filepath" "github.com/opencontainers/cgroups" ) // Inner returns a path to cgroup relative to a cgroup mount point, based // on cgroup configuration, or an error, if cgroup configuration is invalid. // To be used only by fs cgroup managers (systemd has different path rules). func Inner(c *cgroups.Cgroup) (string, error) { if (c.Name != "" || c.Parent != "") && c.Path != "" { return "", errors.New("cgroup: either Path or Name and Parent should be used") } // XXX: Do not remove cleanPath. Path safety is important! -- cyphar innerPath := cleanPath(c.Path) if innerPath == "" { cgParent := cleanPath(c.Parent) cgName := cleanPath(c.Name) innerPath = filepath.Join(cgParent, cgName) } return innerPath, nil } // cleanPath is a copy of github.com/opencontainers/runc/libcontainer/utils.CleanPath. func cleanPath(path string) string { // Deal with empty strings nicely. if path == "" { return "" } // Ensure that all paths are cleaned (especially problematic ones like // "/../../../../../" which can cause lots of issues). if filepath.IsAbs(path) { return filepath.Clean(path) } // If the path isn't absolute, we need to do more processing to fix paths // such as "../../../..//some/path". 
We also shouldn't convert absolute // paths to relative ones. path = filepath.Clean(string(os.PathSeparator) + path) // This can't fail, as (by definition) all paths are relative to root. path, _ = filepath.Rel(string(os.PathSeparator), path) return path } cgroups-0.0.4/manager/000077500000000000000000000000001503527177300146215ustar00rootroot00000000000000cgroups-0.0.4/manager/manager_test.go000066400000000000000000000026131503527177300176230ustar00rootroot00000000000000package manager import ( "testing" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/systemd" ) // TestNilResources checks that a cgroup manager do not panic when // config.Resources is nil. While it does not make sense to use a // manager with no resources, it should not result in a panic. // // This tests either v1 or v2 fs cgroup manager, depending on which // cgroup version is available. func TestNilResources(t *testing.T) { testNilResources(t, false) } // TestNilResourcesSystemd is the same as TestNilResources, // only checking the systemd cgroup manager. func TestNilResourcesSystemd(t *testing.T) { if !systemd.IsRunningSystemd() { t.Skip("requires systemd") } testNilResources(t, true) } func testNilResources(t *testing.T, systemd bool) { cg := &cgroups.Cgroup{} // .Resources is nil cg.Systemd = systemd mgr, err := New(cg) if err != nil { // Some managers require non-nil Resources during // instantiation -- provide and retry. In such case // we're mostly testing Set(nil) below. cg.Resources = &cgroups.Resources{} mgr, err = New(cg) if err != nil { t.Fatal(err) } } _ = mgr.Apply(-1) _ = mgr.Set(nil) _ = mgr.Freeze(cgroups.Thawed) _ = mgr.Exists() _, _ = mgr.GetAllPids() _, _ = mgr.GetCgroups() _, _ = mgr.GetFreezerState() _ = mgr.Path("") _ = mgr.GetPaths() _, _ = mgr.GetStats() _, _ = mgr.OOMKillCount() _ = mgr.Destroy() } cgroups-0.0.4/manager/new.go000066400000000000000000000045551503527177300157520ustar00rootroot00000000000000package manager import ( "errors" "fmt" "path/filepath" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs" "github.com/opencontainers/cgroups/fs2" "github.com/opencontainers/cgroups/systemd" ) // New returns the instance of a cgroup manager, which is chosen // based on the local environment (whether cgroup v1 or v2 is used) // and the config (whether config.Systemd is set or not). func New(config *cgroups.Cgroup) (cgroups.Manager, error) { return NewWithPaths(config, nil) } // NewWithPaths is similar to New, and can be used in case cgroup paths // are already well known, which can save some resources. // // For cgroup v1, the keys are controller/subsystem name, and the values // are absolute filesystem paths to the appropriate cgroups. // // For cgroup v2, the only key allowed is "" (empty string), and the value // is the unified cgroup path. func NewWithPaths(config *cgroups.Cgroup, paths map[string]string) (cgroups.Manager, error) { if config == nil { return nil, errors.New("cgroups/manager.New: config must not be nil") } if config.Systemd && !systemd.IsRunningSystemd() { return nil, errors.New("systemd not running on this host, cannot use systemd cgroups manager") } // Cgroup v2 aka unified hierarchy. if cgroups.IsCgroup2UnifiedMode() { path, err := getUnifiedPath(paths) if err != nil { return nil, fmt.Errorf("manager.NewWithPaths: inconsistent paths: %w", err) } if config.Systemd { return systemd.NewUnifiedManager(config, path) } return fs2.NewManager(config, path) } // Cgroup v1. 
if config.Systemd { return systemd.NewLegacyManager(config, paths) } return fs.NewManager(config, paths) } // getUnifiedPath is an implementation detail of libcontainer. // Historically, libcontainer.Create saves cgroup paths as per-subsystem path // map (as returned by cm.GetPaths(""), but with v2 we only have one single // unified path (with "" as a key). // // This function converts from that map to string (using "" as a key), // and also checks that the map itself is sane. func getUnifiedPath(paths map[string]string) (string, error) { if len(paths) > 1 { return "", fmt.Errorf("expected a single path, got %+v", paths) } path := paths[""] // can be empty if path != "" { if filepath.Clean(path) != path || !filepath.IsAbs(path) { return "", fmt.Errorf("invalid path: %q", path) } } return path, nil } cgroups-0.0.4/stats.go000066400000000000000000000165561503527177300147110ustar00rootroot00000000000000package cgroups type ThrottlingData struct { // Number of periods with throttling active Periods uint64 `json:"periods,omitempty"` // Number of periods when the container hit its throttling limit. ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` // Aggregate time the container was throttled for in nanoseconds. ThrottledTime uint64 `json:"throttled_time,omitempty"` } type BurstData struct { // Number of periods bandwidth burst occurs BurstsPeriods uint64 `json:"bursts_periods,omitempty"` // Cumulative wall-time that any cpus has used above quota in respective periods // Units: nanoseconds. BurstTime uint64 `json:"burst_time,omitempty"` } // CpuUsage denotes the usage of a CPU. // All CPU stats are aggregate since container inception. type CpuUsage struct { // Total CPU time consumed. // Units: nanoseconds. TotalUsage uint64 `json:"total_usage,omitempty"` // Total CPU time consumed per core. // Units: nanoseconds. PercpuUsage []uint64 `json:"percpu_usage,omitempty"` // CPU time consumed per core in kernel mode // Units: nanoseconds. PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` // CPU time consumed per core in user mode // Units: nanoseconds. PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` // Time spent by tasks of the cgroup in kernel mode. // Units: nanoseconds. UsageInKernelmode uint64 `json:"usage_in_kernelmode"` // Time spent by tasks of the cgroup in user mode. // Units: nanoseconds. 
UsageInUsermode uint64 `json:"usage_in_usermode"` } type PSIData struct { Avg10 float64 `json:"avg10"` Avg60 float64 `json:"avg60"` Avg300 float64 `json:"avg300"` Total uint64 `json:"total"` } type PSIStats struct { Some PSIData `json:"some,omitempty"` Full PSIData `json:"full,omitempty"` } type CpuStats struct { CpuUsage CpuUsage `json:"cpu_usage,omitempty"` ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` PSI *PSIStats `json:"psi,omitempty"` BurstData BurstData `json:"burst_data,omitempty"` } type CPUSetStats struct { // List of the physical numbers of the CPUs on which processes // in that cpuset are allowed to execute CPUs []uint16 `json:"cpus,omitempty"` // cpu_exclusive flag CPUExclusive uint64 `json:"cpu_exclusive"` // List of memory nodes on which processes in that cpuset // are allowed to allocate memory Mems []uint16 `json:"mems,omitempty"` // mem_hardwall flag MemHardwall uint64 `json:"mem_hardwall"` // mem_exclusive flag MemExclusive uint64 `json:"mem_exclusive"` // memory_migrate flag MemoryMigrate uint64 `json:"memory_migrate"` // memory_spread page flag MemorySpreadPage uint64 `json:"memory_spread_page"` // memory_spread slab flag MemorySpreadSlab uint64 `json:"memory_spread_slab"` // memory_pressure MemoryPressure uint64 `json:"memory_pressure"` // sched_load balance flag SchedLoadBalance uint64 `json:"sched_load_balance"` // sched_relax_domain_level SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` } type MemoryData struct { Usage uint64 `json:"usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"` Failcnt uint64 `json:"failcnt"` Limit uint64 `json:"limit"` } type MemoryStats struct { // memory used for cache Cache uint64 `json:"cache,omitempty"` // usage of memory Usage MemoryData `json:"usage,omitempty"` // usage of memory + swap SwapUsage MemoryData `json:"swap_usage,omitempty"` // usage of swap only SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"` // usage of kernel memory KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` // usage of memory pages by NUMA node // see chapter 5.6 of memory controller documentation PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` // if true, memory usage is accounted for throughout a hierarchy of cgroups. UseHierarchy bool `json:"use_hierarchy"` Stats map[string]uint64 `json:"stats,omitempty"` PSI *PSIStats `json:"psi,omitempty"` } type PageUsageByNUMA struct { // Embedding is used as types can't be recursive. 
PageUsageByNUMAInner Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` } type PageUsageByNUMAInner struct { Total PageStats `json:"total,omitempty"` File PageStats `json:"file,omitempty"` Anon PageStats `json:"anon,omitempty"` Unevictable PageStats `json:"unevictable,omitempty"` } type PageStats struct { Total uint64 `json:"total,omitempty"` Nodes map[uint8]uint64 `json:"nodes,omitempty"` } type PidsStats struct { // number of pids in the cgroup Current uint64 `json:"current,omitempty"` // active pids hard limit Limit uint64 `json:"limit,omitempty"` } type BlkioStatEntry struct { Major uint64 `json:"major,omitempty"` Minor uint64 `json:"minor,omitempty"` Op string `json:"op,omitempty"` Value uint64 `json:"value,omitempty"` } type BlkioStats struct { // number of bytes transferred to and from the block device IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` PSI *PSIStats `json:"psi,omitempty"` } type HugetlbStats struct { // current res_counter usage for hugetlb Usage uint64 `json:"usage,omitempty"` // maximum usage ever recorded. MaxUsage uint64 `json:"max_usage,omitempty"` // number of times hugetlb usage allocation failure. Failcnt uint64 `json:"failcnt"` } type RdmaEntry struct { Device string `json:"device,omitempty"` HcaHandles uint32 `json:"hca_handles,omitempty"` HcaObjects uint32 `json:"hca_objects,omitempty"` } type RdmaStats struct { RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` } type MiscStats struct { // current resource usage for a key in misc Usage uint64 `json:"usage,omitempty"` // number of times the resource usage was about to go over the max boundary Events uint64 `json:"events,omitempty"` } type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` RdmaStats RdmaStats `json:"rdma_stats,omitempty"` // the map is in the format "misc resource name: stats of the key" MiscStats map[string]MiscStats `json:"misc_stats,omitempty"` } func NewStats() *Stats { memoryStats := MemoryStats{Stats: make(map[string]uint64)} hugetlbStats := make(map[string]HugetlbStats) miscStats := make(map[string]MiscStats) return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats} } cgroups-0.0.4/systemd/000077500000000000000000000000001503527177300146775ustar00rootroot00000000000000cgroups-0.0.4/systemd/common.go000066400000000000000000000254631503527177300165300ustar00rootroot00000000000000package systemd import ( "context" "errors" "fmt" "math" "os" "strconv" "strings" "sync" "time" systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" 
"github.com/sirupsen/logrus" "github.com/opencontainers/cgroups" ) const ( // Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2. // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html defCPUQuotaPeriod = uint64(100000) ) var ( versionOnce sync.Once version int isRunningSystemdOnce sync.Once isRunningSystemd bool // GenerateDeviceProps is a function to generate systemd device // properties, used by Set methods. Unless // [github.com/opencontainers/cgroups/devices] // package is imported, it is set to nil, so cgroup managers can't // configure devices. GenerateDeviceProps func(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) ) // NOTE: This function comes from package github.com/coreos/go-systemd/util // It was borrowed here to avoid a dependency on cgo. // // IsRunningSystemd checks whether the host was booted with systemd as its init // system. This functions similarly to systemd's `sd_booted(3)`: internally, it // checks whether /run/systemd/system/ exists and is a directory. // http://www.freedesktop.org/software/systemd/man/sd_booted.html func IsRunningSystemd() bool { isRunningSystemdOnce.Do(func() { fi, err := os.Lstat("/run/systemd/system") isRunningSystemd = err == nil && fi.IsDir() }) return isRunningSystemd } // systemd represents slice hierarchy using `-`, so we need to follow suit when // generating the path of slice. Essentially, test-a-b.slice becomes // /test.slice/test-a.slice/test-a-b.slice. func ExpandSlice(slice string) (string, error) { suffix := ".slice" // Name has to end with ".slice", but can't be just ".slice". if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { return "", fmt.Errorf("invalid slice name: %s", slice) } // Path-separators are not allowed. if strings.Contains(slice, "/") { return "", fmt.Errorf("invalid slice name: %s", slice) } var path, prefix string sliceName := strings.TrimSuffix(slice, suffix) // if input was -.slice, we should just return root now if sliceName == "-" { return "/", nil } for _, component := range strings.Split(sliceName, "-") { // test--a.slice isn't permitted, nor is -test.slice. if component == "" { return "", fmt.Errorf("invalid slice name: %s", slice) } // Append the component to the path and to the prefix. path += "/" + prefix + component + suffix prefix += component + "-" } return path, nil } func newProp(name string, units any) systemdDbus.Property { return systemdDbus.Property{ Name: name, Value: dbus.MakeVariant(units), } } func getUnitName(c *cgroups.Cgroup) string { // by default, we create a scope unless the user explicitly asks for a slice. if !strings.HasSuffix(c.Name, ".slice") { return c.ScopePrefix + "-" + c.Name + ".scope" } return c.Name } // This code should be in sync with getUnitName. func getUnitType(unitName string) string { if strings.HasSuffix(unitName, ".slice") { return "Slice" } return "Scope" } // isDbusError returns true if the error is a specific dbus error. func isDbusError(err error, name string) bool { if err != nil { var derr dbus.Error if errors.As(err, &derr) { return strings.Contains(derr.Name, name) } } return false } // isUnitExists returns true if the error is that a systemd unit already exists. 
func isUnitExists(err error) bool { return isDbusError(err, "org.freedesktop.systemd1.UnitExists") } func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { statusChan := make(chan string, 1) retry := true retry: err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) return err }) if err != nil { if !isUnitExists(err) { return err } if ignoreExist { // TODO: remove this hack. // This is kubelet making sure a slice exists (see // https://github.com/opencontainers/runc/pull/1124). return nil } if retry { // In case a unit with the same name exists, this may // be a leftover failed unit. Reset it, so systemd can // remove it, and retry once. err = resetFailedUnit(cm, unitName) if err != nil { logrus.Warnf("unable to reset failed unit: %v", err) } retry = false goto retry } return err } timeout := time.NewTimer(30 * time.Second) defer timeout.Stop() select { case s := <-statusChan: close(statusChan) // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit if s != "done" { _ = resetFailedUnit(cm, unitName) return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) } case <-timeout.C: _ = resetFailedUnit(cm, unitName) return errors.New("Timeout waiting for systemd to create " + unitName) } return nil } func stopUnit(cm *dbusConnManager, unitName string) error { statusChan := make(chan string, 1) err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan) return err }) if err == nil { timeout := time.NewTimer(30 * time.Second) defer timeout.Stop() select { case s := <-statusChan: close(statusChan) // Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit if s != "done" { logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s) } case <-timeout.C: return errors.New("Timed out while waiting for systemd to remove " + unitName) } } // In case of a failed unit, let systemd remove it. _ = resetFailedUnit(cm, unitName) return nil } func resetFailedUnit(cm *dbusConnManager, name string) error { return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { return c.ResetFailedUnitContext(context.TODO(), name) }) } func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { var prop *systemdDbus.Property err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) return Err }) return prop, err } func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) 
}) } func getManagerProperty(cm *dbusConnManager, name string) (string, error) { str := "" err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { var err error str, err = c.GetManagerProperty(name) return err }) if err != nil { return "", err } return strconv.Unquote(str) } func systemdVersion(cm *dbusConnManager) int { versionOnce.Do(func() { version = -1 verStr, err := getManagerProperty(cm, "Version") if err == nil { version, err = systemdVersionAtoi(verStr) } if err != nil { logrus.WithError(err).Error("unable to get systemd version") } }) return version } // systemdVersionAtoi extracts a numeric systemd version from the argument. // The argument should be of the form: "v245.4-1.fc32", "245", "v245-1.fc32", // "245-1.fc32" (with or without quotes). The result for all of the above // should be 245. func systemdVersionAtoi(str string) (int, error) { // Unconditionally remove the leading quote and/or "v" prefix. str = strings.TrimLeft(str, `"v`) // Match on the first integer we can grab. for i := range len(str) { if str[i] < '0' || str[i] > '9' { // First non-digit: cut the tail. str = str[:i] break } } ver, err := strconv.Atoi(str) if err != nil { return -1, fmt.Errorf("can't parse version: %w", err) } return ver, nil } // addCPUQuota adds CPUQuotaPeriodUSec and CPUQuotaPerSecUSec to the properties. The passed quota may be modified // along with round-up during calculation in order to write the same value to cgroupfs later. func addCPUQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota *int64, period uint64) { if period != 0 { // systemd only supports CPUQuotaPeriodUSec since v242 sdVer := systemdVersion(cm) if sdVer >= 242 { *properties = append(*properties, newProp("CPUQuotaPeriodUSec", period)) } else { logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodUSec"+ " (setting will still be applied to cgroupfs)", sdVer) } } if *quota != 0 || period != 0 { // corresponds to USEC_INFINITY in systemd cpuQuotaPerSecUSec := uint64(math.MaxUint64) if *quota > 0 { if period == 0 { // assume the default period = defCPUQuotaPeriod } // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota // (integer percentage of CPU) internally. This means that if a fractional percent of // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. cpuQuotaPerSecUSec = uint64(*quota*1000000) / period if cpuQuotaPerSecUSec%10000 != 0 { cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 // Update the requested quota along with the round-up in order to write the same value to cgroupfs.
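// For example, quota=123456 with the default period of 100000 us gives
// cpuQuotaPerSecUSec = 123456*1000000/100000 = 1234560, rounded up to
// 1240000; the write-back below then yields quota = 1240000*100000/1000000
// = 124000 (this matches the "With fraction" case in TestAddCPUQuota).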
*quota = int64(cpuQuotaPerSecUSec) * int64(period) / 1000000 } } *properties = append(*properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) } } func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error { if cpus == "" && mems == "" { return nil } // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 sdVer := systemdVersion(cm) if sdVer < 244 { logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ " (settings will still be applied to cgroupfs)", sdVer) return nil } if cpus != "" { bits, err := RangeToBits(cpus) if err != nil { return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", cpus, err) } *props = append(*props, newProp("AllowedCPUs", bits)) } if mems != "" { bits, err := RangeToBits(mems) if err != nil { return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", mems, err) } *props = append(*props, newProp("AllowedMemoryNodes", bits)) } return nil } // generateDeviceProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { if GenerateDeviceProps == nil { if len(r.Devices) > 0 { return nil, cgroups.ErrDevicesUnsupported } return nil, nil } return GenerateDeviceProps(r, systemdVersion(cm)) } cgroups-0.0.4/systemd/cpuset.go000066400000000000000000000025611503527177300165350ustar00rootroot00000000000000package systemd import ( "errors" "math/big" "strconv" "strings" ) // RangeToBits converts a text representation of a CPU mask (as written to // or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes // with the corresponding bits set (as consumed by systemd over dbus as // AllowedCPUs/AllowedMemoryNodes unit property value). 
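//
// For example (an illustrative conversion; see cpuset_test.go for more):
// "1,3-5" sets bits 1, 3, 4 and 5, which is returned as []byte{0x3a}.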
func RangeToBits(str string) ([]byte, error) { bits := new(big.Int) for _, r := range strings.Split(str, ",") { // allow extra spaces around r = strings.TrimSpace(r) // allow empty elements (extra commas) if r == "" { continue } startr, endr, ok := strings.Cut(r, "-") if ok { start, err := strconv.ParseUint(startr, 10, 32) if err != nil { return nil, err } end, err := strconv.ParseUint(endr, 10, 32) if err != nil { return nil, err } if start > end { return nil, errors.New("invalid range: " + r) } for i := start; i <= end; i++ { bits.SetBit(bits, int(i), 1) } } else { val, err := strconv.ParseUint(startr, 10, 32) if err != nil { return nil, err } bits.SetBit(bits, int(val), 1) } } ret := bits.Bytes() if len(ret) == 0 { // do not allow empty values return nil, errors.New("empty value") } // fit cpuset parsing order in systemd for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { ret[l], ret[r] = ret[r], ret[l] } return ret, nil } cgroups-0.0.4/systemd/cpuset_test.go000066400000000000000000000025161503527177300175740ustar00rootroot00000000000000package systemd import ( "bytes" "testing" ) func TestRangeToBits(t *testing.T) { testCases := []struct { in string out []byte isErr bool }{ {in: "", isErr: true}, {in: "0", out: []byte{1}}, {in: "1", out: []byte{2}}, {in: "0-1", out: []byte{3}}, {in: "0,1", out: []byte{3}}, {in: ",0,1,", out: []byte{3}}, {in: "0-3", out: []byte{0x0f}}, {in: "0,1,2-3", out: []byte{0x0f}}, {in: "4-7", out: []byte{0xf0}}, {in: "0-7", out: []byte{0xff}}, {in: "0-15", out: []byte{0xff, 0xff}}, {in: "16", out: []byte{0, 0, 1}}, {in: "0-3,32-33", out: []byte{0x0f, 0, 0, 0, 3}}, // extra spaces and tabs are ok {in: "1, 2, 1-2", out: []byte{6}}, {in: " , 1 , 3 , 5-7, ", out: []byte{0xea}}, // somewhat large values {in: "128-130,1", out: []byte{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}}, {in: "-", isErr: true}, {in: "1-", isErr: true}, {in: "-3", isErr: true}, // bad range (start > end) {in: "54-53", isErr: true}, // kernel does not allow extra spaces inside a range {in: "1 - 2", isErr: true}, } for _, tc := range testCases { out, err := RangeToBits(tc.in) if err != nil { if !tc.isErr { t.Errorf("case %q: unexpected error: %v", tc.in, err) } continue } if !bytes.Equal(out, tc.out) { t.Errorf("case %q: expected %v, got %v", tc.in, tc.out, out) } } } cgroups-0.0.4/systemd/dbus.go000066400000000000000000000054241503527177300161700ustar00rootroot00000000000000package systemd import ( "context" "errors" "fmt" "sync" systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" ) var ( dbusC *systemdDbus.Conn dbusMu sync.RWMutex dbusInited bool dbusRootless bool ) type dbusConnManager struct{} // newDbusConnManager initializes systemd dbus connection manager. func newDbusConnManager(rootless bool) *dbusConnManager { dbusMu.Lock() defer dbusMu.Unlock() if dbusInited && rootless != dbusRootless { panic("can't have both root and rootless dbus") } dbusInited = true dbusRootless = rootless return &dbusConnManager{} } // getConnection lazily initializes and returns systemd dbus connection. func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { // In the case where dbusC != nil // Use the read lock the first time to ensure // that Conn can be acquired at the same time. 
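// What follows is the classic double-checked locking pattern: take the
// cheap read lock for the common case of an existing connection, then fall
// back to the write lock and re-check before creating a new one.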
dbusMu.RLock() if conn := dbusC; conn != nil { dbusMu.RUnlock() return conn, nil } dbusMu.RUnlock() // In the case where dbusC == nil // Use write lock to ensure that only one // will be created dbusMu.Lock() defer dbusMu.Unlock() if conn := dbusC; conn != nil { return conn, nil } conn, err := d.newConnection() if err != nil { // When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false. // This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown." // https://github.com/moby/moby/issues/42793 return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err) } dbusC = conn return conn, nil } func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { if dbusRootless { return newUserSystemdDbus() } return systemdDbus.NewWithContext(context.TODO()) } // resetConnection resets the connection to its initial state // (so it can be reconnected if necessary). func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) { dbusMu.Lock() defer dbusMu.Unlock() if dbusC != nil && dbusC == conn { dbusC.Close() dbusC = nil } } // retryOnDisconnect calls op, and if the error it returns is about closed dbus // connection, the connection is re-established and the op is retried. This helps // with the situation when dbus is restarted and we have a stale connection. func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error { for { conn, err := d.getConnection() if err != nil { return err } err = op(conn) if err == nil { return nil } if !errors.Is(err, dbus.ErrClosed) { return err } d.resetConnection(conn) } } cgroups-0.0.4/systemd/devices.go000066400000000000000000000043571503527177300166610ustar00rootroot00000000000000package systemd import ( "reflect" dbus "github.com/godbus/dbus/v5" "github.com/opencontainers/cgroups" ) // freezeBeforeSet answers whether there is a need to freeze the cgroup before // applying its systemd unit properties, and thaw after, while avoiding // unnecessary freezer state changes. // // The reason why we have to freeze is that systemd's application of device // rules is done disruptively, resulting in spurious errors to common devices // (unlike our fs driver, they will happily write deny-all rules to running // containers). So we have to freeze the container to avoid the container get // an occasional "permission denied" error. func (m *LegacyManager) freezeBeforeSet(unitName string, r *cgroups.Resources) (needsFreeze, needsThaw bool, err error) { // Special case for SkipDevices, as used by Kubernetes to create pod // cgroups with allow-all device policy). if r.SkipDevices { if r.SkipFreezeOnSet { // Both needsFreeze and needsThaw are false. return } // No need to freeze if SkipDevices is set, and either // (1) systemd unit does not (yet) exist, or // (2) it has DevicePolicy=auto and empty DeviceAllow list. // // Interestingly, (1) and (2) are the same here because // a non-existent unit returns default properties, // and settings in (2) are the defaults. // // Do not return errors from getUnitTypeProperty, as they alone // should not prevent Set from working. 
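// In short: if the unit still has its default device configuration
// (DevicePolicy=auto and an empty DeviceAllow), applying properties cannot
// disrupt device access, so the freeze/thaw dance can be skipped.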
unitType := getUnitType(unitName) devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") if e == nil { if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { needsFreeze = false needsThaw = false return } } } } needsFreeze = true needsThaw = true // Check the current freezer state. freezerState, err := m.GetFreezerState() if err != nil { return } if freezerState == cgroups.Frozen { // Already frozen, and should stay frozen. needsFreeze = false needsThaw = false } if r.Freezer == cgroups.Frozen { // Will be frozen anyway -- no need to thaw. needsThaw = false } return } cgroups-0.0.4/systemd/freeze_test.go000066400000000000000000000200741503527177300175500ustar00rootroot00000000000000package systemd import ( "bufio" "bytes" "os" "os/exec" "strings" "testing" "github.com/opencontainers/cgroups" "golang.org/x/sys/unix" ) func TestFreezeBeforeSet(t *testing.T) { requireV1(t) testCases := []struct { desc string // Test input. cg *cgroups.Cgroup preFreeze bool // Expected values. // Before unit creation (Apply). freeze0, thaw0 bool // After unit creation. freeze1, thaw1 bool }{ { // A slice with SkipDevices. desc: "slice,skip-devices", cg: &cgroups.Cgroup{ Name: "system-runc_test_freeze_1.slice", Parent: "system.slice", Resources: &cgroups.Resources{ SkipDevices: true, }, }, // Expected. freeze0: false, thaw0: false, freeze1: false, thaw1: false, }, { // A scope with SkipDevices. Not a realistic scenario with runc // (as container can't have SkipDevices == true), but possible // for a standalone cgroup manager. desc: "scope,skip-devices", cg: &cgroups.Cgroup{ ScopePrefix: "test", Name: "testFreeze2", Parent: "system.slice", Resources: &cgroups.Resources{ SkipDevices: true, }, }, // Expected. freeze0: false, thaw0: false, freeze1: false, thaw1: false, }, { // A slice that is about to be frozen in Set. desc: "slice,will-freeze", cg: &cgroups.Cgroup{ Name: "system-runc_test_freeze_3.slice", Parent: "system.slice", Resources: &cgroups.Resources{ Freezer: cgroups.Frozen, }, }, // Expected. freeze0: true, thaw0: false, freeze1: true, thaw1: false, }, { // A pre-frozen slice that should stay frozen. desc: "slice,pre-frozen,will-freeze", cg: &cgroups.Cgroup{ Name: "system-runc_test_freeze_4.slice", Parent: "system.slice", Resources: &cgroups.Resources{ Freezer: cgroups.Frozen, }, }, preFreeze: true, // Expected. freeze0: true, // not actually frozen yet. thaw0: false, freeze1: false, thaw1: false, }, { // A pre-frozen scope with skip devices set. desc: "scope,pre-frozen,skip-devices", cg: &cgroups.Cgroup{ ScopePrefix: "test", Name: "testFreeze5", Parent: "system.slice", Resources: &cgroups.Resources{ SkipDevices: true, }, }, preFreeze: true, // Expected. freeze0: false, thaw0: false, freeze1: false, thaw1: false, }, { // A pre-frozen scope which will be thawed. desc: "scope,pre-frozen", cg: &cgroups.Cgroup{ ScopePrefix: "test", Name: "testFreeze6", Parent: "system.slice", Resources: &cgroups.Resources{}, }, preFreeze: true, // Expected. freeze0: true, // not actually frozen yet. thaw0: true, freeze1: false, thaw1: false, }, } for _, tc := range testCases { tc := tc t.Run(tc.desc, func(t *testing.T) { m, err := NewLegacyManager(tc.cg, nil) if err != nil { t.Fatal(err) } defer m.Destroy() //nolint:errcheck // Checks for a non-existent unit. 
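// (Apply has not been called yet, so the unit does not exist and
// freezeBeforeSet only sees the default unit properties.)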
freeze, thaw, err := m.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) if err != nil { t.Fatal(err) } if freeze != tc.freeze0 || thaw != tc.thaw0 { t.Errorf("before Apply (non-existent unit): expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", tc.freeze0, tc.thaw0, freeze, thaw) } // Create systemd unit. pid := -1 if strings.HasSuffix(getUnitName(tc.cg), ".scope") { // Scopes require a process inside. cmd := exec.Command("bash", "-c", "sleep 1m") if err := cmd.Start(); err != nil { t.Fatal(err) } pid = cmd.Process.Pid // Make sure to not leave a zombie. defer func() { // These may fail, we don't care. _ = cmd.Process.Kill() _ = cmd.Wait() }() } if err := m.Apply(pid); err != nil { t.Fatal(err) } if tc.preFreeze { if err := m.Freeze(cgroups.Frozen); err != nil { t.Error(err) return // no more checks } } freeze, thaw, err = m.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) if err != nil { t.Error(err) return // no more checks } if freeze != tc.freeze1 || thaw != tc.thaw1 { t.Errorf("expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", tc.freeze1, tc.thaw1, freeze, thaw) } // Destroy() timeouts on a frozen container, so we need to thaw it. if tc.preFreeze { if err := m.Freeze(cgroups.Thawed); err != nil { t.Error(err) } } // Destroy() does not kill processes in cgroup, so we should. if pid != -1 { if err = unix.Kill(pid, unix.SIGKILL); err != nil { t.Errorf("unable to kill pid %d: %s", pid, err) } } // Not really needed, but may help catch some bugs. if err := m.Destroy(); err != nil { t.Errorf("destroy: %s", err) } }) } } // requireV1 skips the test unless a set of requirements (cgroup v1, // systemd, root) is met. func requireV1(t *testing.T) { t.Helper() if cgroups.IsCgroup2UnifiedMode() { t.Skip("Test requires cgroup v1.") } if !IsRunningSystemd() { t.Skip("Test requires systemd.") } if os.Geteuid() != 0 { t.Skip("Test requires root.") } } func TestFreezePodCgroup(t *testing.T) { if !IsRunningSystemd() { t.Skip("Test requires systemd.") } if os.Geteuid() != 0 { t.Skip("Test requires root.") } podConfig := &cgroups.Cgroup{ Parent: "system.slice", Name: "system-runc_test_pod.slice", Resources: &cgroups.Resources{ SkipDevices: true, Freezer: cgroups.Frozen, }, } // Create a "pod" cgroup (a systemd slice to hold containers), // which is frozen initially. pm := newManager(t, podConfig) if err := pm.Apply(-1); err != nil { t.Fatal(err) } if err := pm.Set(podConfig.Resources); err != nil { t.Fatal(err) } // Check the pod is frozen. pf, err := pm.GetFreezerState() if err != nil { t.Fatal(err) } if pf != cgroups.Frozen { t.Fatalf("expected pod to be frozen, got %v", pf) } // Create a "container" within the "pod" cgroup. // This is not a real container, just a process in the cgroup. containerConfig := &cgroups.Cgroup{ Parent: "system-runc_test_pod.slice", ScopePrefix: "test", Name: "inner-container", Resources: &cgroups.Resources{}, } cmd := exec.Command("bash", "-c", "while read; do echo $REPLY; done") cmd.Env = append(os.Environ(), "LANG=C") // Setup stdin. stdinR, stdinW, err := os.Pipe() if err != nil { t.Fatal(err) } cmd.Stdin = stdinR // Setup stdout. stdoutR, stdoutW, err := os.Pipe() if err != nil { t.Fatal(err) } cmd.Stdout = stdoutW rdr := bufio.NewReader(stdoutR) // Setup stderr. var stderr bytes.Buffer cmd.Stderr = &stderr err = cmd.Start() stdinR.Close() stdoutW.Close() defer func() { _ = stdinW.Close() _ = stdoutR.Close() }() if err != nil { t.Fatal(err) } // Make sure to not leave a zombie. defer func() { // These may fail, we don't care. 
_ = cmd.Process.Kill() _ = cmd.Wait() }() // Put the process into a cgroup. cm := newManager(t, containerConfig) if err := cm.Apply(cmd.Process.Pid); err != nil { t.Fatal(err) } if err := cm.Set(containerConfig.Resources); err != nil { t.Fatal(err) } // Check that we put the "container" into the "pod" cgroup. if !strings.HasPrefix(cm.Path("freezer"), pm.Path("freezer")) { t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", cm.Path("freezer"), pm.Path("freezer")) } // Check the container is not reported as frozen despite the frozen parent. cf, err := cm.GetFreezerState() if err != nil { t.Fatal(err) } if cf != cgroups.Thawed { t.Fatalf("expected container to be thawed, got %v", cf) } // Unfreeze the pod. if err := pm.Freeze(cgroups.Thawed); err != nil { t.Fatal(err) } cf, err = cm.GetFreezerState() if err != nil { t.Fatal(err) } if cf != cgroups.Thawed { t.Fatalf("expected container to be thawed, got %v", cf) } // Check the "container" works. marker := "one two\n" _, err = stdinW.WriteString(marker) if err != nil { t.Fatal(err) } reply, err := rdr.ReadString('\n') if err != nil { t.Fatalf("reading from container: %v", err) } if reply != marker { t.Fatalf("expected %q, got %q", marker, reply) } } cgroups-0.0.4/systemd/systemd_test.go000066400000000000000000000133431503527177300177610ustar00rootroot00000000000000package systemd import ( "os" "reflect" "testing" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/opencontainers/cgroups" ) func newManager(t *testing.T, config *cgroups.Cgroup) (m cgroups.Manager) { t.Helper() var err error if cgroups.IsCgroup2UnifiedMode() { m, err = NewUnifiedManager(config, "") } else { m, err = NewLegacyManager(config, nil) } if err != nil { t.Fatal(err) } t.Cleanup(func() { _ = m.Destroy() }) return m } func TestSystemdVersion(t *testing.T) { systemdVersionTests := []struct { verStr string expectedVer int expectErr bool }{ {`"219"`, 219, false}, {`"v245.4-1.fc32"`, 245, false}, {`"241-1"`, 241, false}, {`"v241-1"`, 241, false}, {`333.45"`, 333, false}, {`v321-0`, 321, false}, {"NaN", -1, true}, {"", -1, true}, {"v", -1, true}, } for _, sdTest := range systemdVersionTests { ver, err := systemdVersionAtoi(sdTest.verStr) if !sdTest.expectErr && err != nil { t.Errorf("systemdVersionAtoi(%s); want nil; got %v", sdTest.verStr, err) } if sdTest.expectErr && err == nil { t.Errorf("systemdVersionAtoi(%s); wanted failure; got nil", sdTest.verStr) } if ver != sdTest.expectedVer { t.Errorf("systemdVersionAtoi(%s); want %d; got %d", sdTest.verStr, sdTest.expectedVer, ver) } } } func TestValidUnitTypes(t *testing.T) { testCases := []struct { unitName string expectedUnitType string }{ {"system.slice", "Slice"}, {"kubepods.slice", "Slice"}, {"testing-container:ab.scope", "Scope"}, } for _, sdTest := range testCases { unitType := getUnitType(sdTest.unitName) if unitType != sdTest.expectedUnitType { t.Errorf("getUnitType(%s); want %q; got %q", sdTest.unitName, sdTest.expectedUnitType, unitType) } } } func TestUnitExistsIgnored(t *testing.T) { if !IsRunningSystemd() { t.Skip("Test requires systemd.") } if os.Geteuid() != 0 { t.Skip("Test requires root.") } podConfig := &cgroups.Cgroup{ Parent: "system.slice", Name: "system-runc_test_exists.slice", Resources: &cgroups.Resources{}, } // Create "pods" cgroup (a systemd slice to hold containers). pm := newManager(t, podConfig) // create twice to make sure "UnitExists" error is ignored. 
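// (The second Apply(-1) makes startUnit hit the
// org.freedesktop.systemd1.UnitExists error, which is ignored because
// pid == -1 implies ignoreExist == true.)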
for range 2 { if err := pm.Apply(-1); err != nil { t.Fatal(err) } } } func TestUnifiedResToSystemdProps(t *testing.T) { if !IsRunningSystemd() { t.Skip("Test requires systemd.") } if !cgroups.IsCgroup2UnifiedMode() { t.Skip("cgroup v2 is required") } cm := newDbusConnManager(os.Geteuid() != 0) testCases := []struct { name string minVer int res map[string]string expError bool expProps []systemdDbus.Property }{ { name: "empty map", res: map[string]string{}, }, { name: "only cpu.idle=1", minVer: cpuIdleSupportedVersion, res: map[string]string{ "cpu.idle": "1", }, expProps: []systemdDbus.Property{ newProp("CPUWeight", uint64(0)), }, }, { name: "only cpu.idle=0", minVer: cpuIdleSupportedVersion, res: map[string]string{ "cpu.idle": "0", }, }, { name: "cpu.idle=1 and cpu.weight=1000", minVer: cpuIdleSupportedVersion, res: map[string]string{ "cpu.idle": "1", "cpu.weight": "1000", }, expProps: []systemdDbus.Property{ newProp("CPUWeight", uint64(0)), }, }, { name: "cpu.idle=0 and cpu.weight=1000", minVer: cpuIdleSupportedVersion, res: map[string]string{ "cpu.idle": "0", "cpu.weight": "1000", }, expProps: []systemdDbus.Property{ newProp("CPUWeight", uint64(1000)), }, }, } for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { if tc.minVer != 0 && systemdVersion(cm) < tc.minVer { t.Skipf("requires systemd >= %d", tc.minVer) } props, err := unifiedResToSystemdProps(cm, tc.res) if err != nil && !tc.expError { t.Fatalf("expected no error, got: %v", err) } if err == nil && tc.expError { t.Fatal("expected error, got nil") } if !reflect.DeepEqual(tc.expProps, props) { t.Errorf("wrong properties (exp %+v, got %+v)", tc.expProps, props) } }) } } func TestAddCPUQuota(t *testing.T) { if !IsRunningSystemd() { t.Skip("Test requires systemd.") } cm := newDbusConnManager(os.Geteuid() != 0) testCases := []struct { name string quota int64 period uint64 expectedCPUQuotaPerSecUSec uint64 expectedQuota int64 }{ { name: "No round up", quota: 500000, period: 1000000, expectedCPUQuotaPerSecUSec: 500000, expectedQuota: 500000, }, { name: "With fraction", quota: 123456, expectedCPUQuotaPerSecUSec: 1240000, expectedQuota: 124000, }, { name: "Round up at division", quota: 500000, period: 900000, expectedCPUQuotaPerSecUSec: 560000, expectedQuota: 504000, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { props := []systemdDbus.Property{} addCPUQuota(cm, &props, &tc.quota, tc.period) var cpuQuotaPerSecUSec uint64 for _, p := range props { if p.Name == "CPUQuotaPerSecUSec" { if err := p.Value.Store(&cpuQuotaPerSecUSec); err != nil { t.Errorf("failed to parse CPUQuotaPerSecUSec: %v", err) } } } if cpuQuotaPerSecUSec != tc.expectedCPUQuotaPerSecUSec { t.Errorf("CPUQuotaPerSecUSec is not set as expected (exp: %v, got: %v)", tc.expectedCPUQuotaPerSecUSec, cpuQuotaPerSecUSec) } if tc.quota != tc.expectedQuota { t.Errorf("quota is not updated as expected (exp: %v, got: %v)", tc.expectedQuota, tc.quota) } }) } } cgroups-0.0.4/systemd/user.go000066400000000000000000000053701503527177300162110ustar00rootroot00000000000000package systemd import ( "bufio" "bytes" "errors" "fmt" "os" "os/exec" "path/filepath" "strconv" "strings" systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" "github.com/moby/sys/userns" ) // newUserSystemdDbus creates a connection for systemd user-instance. 
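//
// Roughly, the steps below are: detect the session bus address (either
// $DBUS_SESSION_BUS_ADDRESS or $XDG_RUNTIME_DIR/bus), detect the owner UID,
// dial the bus, authenticate using the EXTERNAL mechanism with that UID,
// and finally send the D-Bus Hello message.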
func newUserSystemdDbus() (*systemdDbus.Conn, error) { addr, err := DetectUserDbusSessionBusAddress() if err != nil { return nil, err } uid, err := DetectUID() if err != nil { return nil, err } return systemdDbus.NewConnection(func() (*dbus.Conn, error) { conn, err := dbus.Dial(addr) if err != nil { return nil, fmt.Errorf("error while dialing %q: %w", addr, err) } methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} err = conn.Auth(methods) if err != nil { conn.Close() return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err) } if err = conn.Hello(); err != nil { conn.Close() return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err) } return conn, nil }) } // DetectUID detects UID from the OwnerUID field of `busctl --user status` // if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) . // // Otherwise returns os.Getuid() . func DetectUID() (int, error) { if !userns.RunningInUserNS() { return os.Getuid(), nil } b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() if err != nil { return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err) } scanner := bufio.NewScanner(bytes.NewReader(b)) for scanner.Scan() { s := strings.TrimSpace(scanner.Text()) if uidStr, ok := strings.CutPrefix(s, "OwnerUID="); ok { i, err := strconv.Atoi(uidStr) if err != nil { return -1, fmt.Errorf("could not detect the OwnerUID: %w", err) } return i, nil } } if err := scanner.Err(); err != nil { return -1, err } return -1, errors.New("could not detect the OwnerUID") } // DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS, if set. // Otherwise it returns "unix:path=$XDG_RUNTIME_DIR/bus", if $XDG_RUNTIME_DIR/bus exists. func DetectUserDbusSessionBusAddress() (string, error) { if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { return env, nil } if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { busPath := filepath.Join(xdr, "bus") if _, err := os.Stat(busPath); err == nil { busAddress := "unix:path=" + dbus.EscapeBusAddressValue(busPath) return busAddress, nil } } return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from the environment; make sure you have installed the dbus-user-session or dbus-daemon package; note you may need to re-login") } cgroups-0.0.4/systemd/v1.go000066400000000000000000000236541503527177300155660ustar00rootroot00000000000000package systemd import ( "errors" "os" "path/filepath" "strings" "sync" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/sirupsen/logrus" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs" ) type LegacyManager struct { mu sync.Mutex cgroups *cgroups.Cgroup paths map[string]string dbus *dbusConnManager } func NewLegacyManager(cg *cgroups.Cgroup, paths map[string]string) (*LegacyManager, error) { if cg.Rootless { return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1") } if cg.Resources != nil && cg.Resources.Unified != nil { return nil, cgroups.ErrV1NoUnified } if paths == nil { var err error paths, err = initPaths(cg) if err != nil { return nil, err } } return &LegacyManager{ cgroups: cg, paths: paths, dbus: newDbusConnManager(false), }, nil } type subsystem interface { // Name returns the name of the subsystem. Name() string // GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'. 
GetStats(path string, stats *cgroups.Stats) error // Set sets cgroup resource limits. Set(path string, r *cgroups.Resources) error } var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") var legacySubsystems = []subsystem{ &fs.CpusetGroup{}, &fs.DevicesGroup{}, &fs.MemoryGroup{}, &fs.CpuGroup{}, &fs.CpuacctGroup{}, &fs.PidsGroup{}, &fs.BlkioGroup{}, &fs.HugetlbGroup{}, &fs.PerfEventGroup{}, &fs.FreezerGroup{}, &fs.NetPrioGroup{}, &fs.NetClsGroup{}, &fs.NameGroup{GroupName: "name=systemd"}, &fs.RdmaGroup{}, &fs.NameGroup{GroupName: "misc"}, } func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property deviceProperties, err := generateDeviceProperties(r, cm) if err != nil { return nil, err } properties = append(properties, deviceProperties...) if r.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(r.Memory))) } if r.CpuShares != 0 { properties = append(properties, newProp("CPUShares", r.CpuShares)) } addCPUQuota(cm, &properties, &r.CpuQuota, r.CpuPeriod) if r.BlkioWeight != 0 { properties = append(properties, newProp("BlockIOWeight", uint64(r.BlkioWeight))) } if r.PidsLimit > 0 || r.PidsLimit == -1 { properties = append(properties, newProp("TasksMax", uint64(r.PidsLimit))) } err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) if err != nil { return nil, err } return properties, nil } // initPaths figures out and returns paths to cgroups. func initPaths(c *cgroups.Cgroup) (map[string]string, error) { slice := "system.slice" if c.Parent != "" { var err error slice, err = ExpandSlice(c.Parent) if err != nil { return nil, err } } unit := getUnitName(c) paths := make(map[string]string) for _, s := range legacySubsystems { subsystemPath, err := getSubsystemPath(slice, unit, s.Name()) if err != nil { // Even if it's `not found` error, we'll return err // because devices cgroup is hard requirement for // container security. if s.Name() == "devices" { return nil, err } // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return nil, err } paths[s.Name()] = subsystemPath } // If systemd is using cgroups-hybrid mode then add the slice path of // this container to the paths so the following process executed with // "runc exec" joins that cgroup as well. if cgroups.IsCgroup2HybridMode() { // "" means cgroup-hybrid path cgroupsHybridPath, err := getSubsystemPath(slice, unit, "") if err != nil && cgroups.IsNotFound(err) { return nil, err } paths[""] = cgroupsHybridPath } return paths, nil } func (m *LegacyManager) Apply(pid int) error { var ( c = m.cgroups unitName = getUnitName(c) slice = "system.slice" properties []systemdDbus.Property ) m.mu.Lock() defer m.mu.Unlock() if c.Parent != "" { slice = c.Parent } properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) if strings.HasSuffix(unitName, ".slice") { // If we create a slice, the parent is defined via a Wants=. properties = append(properties, systemdDbus.PropWants(slice)) } else { // Otherwise it's a scope, which we put into a Slice=. properties = append(properties, systemdDbus.PropSlice(slice)) // Assume scopes always support delegation (supported since systemd v218). properties = append(properties, newProp("Delegate", true)) } // only add pid if its valid, -1 is used w/ general slice creation. 
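// (pid == -1 means Apply(-1), i.e. creating the unit, typically a slice,
// without putting any process into it; compare the ignoreExist argument
// passed to startUnit below.)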
if pid != -1 { properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) } // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, newProp("MemoryAccounting", true), newProp("CPUAccounting", true), newProp("BlockIOAccounting", true), newProp("TasksAccounting", true), ) // Assume DefaultDependencies= will always work (the check for it was previously broken.) properties = append(properties, newProp("DefaultDependencies", false)) properties = append(properties, c.SystemdProps...) if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return err } if err := m.joinCgroups(pid); err != nil { return err } return nil } func (m *LegacyManager) Destroy() error { m.mu.Lock() defer m.mu.Unlock() stopErr := stopUnit(m.dbus, getUnitName(m.cgroups)) // Both on success and on error, cleanup all the cgroups // we are aware of, as some of them were created directly // by Apply() and are not managed by systemd. if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil { return err } return stopErr } func (m *LegacyManager) Path(subsys string) string { m.mu.Lock() defer m.mu.Unlock() return m.paths[subsys] } func (m *LegacyManager) joinCgroups(pid int) error { for _, sys := range legacySubsystems { name := sys.Name() switch name { case "name=systemd": // let systemd handle this case "cpuset": if path, ok := m.paths[name]; ok { s := &fs.CpusetGroup{} if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil { return err } } default: if path, ok := m.paths[name]; ok { if err := os.MkdirAll(path, 0o755); err != nil { return err } if err := cgroups.WriteCgroupProc(path, pid); err != nil { return err } } } } return nil } func getSubsystemPath(slice, unit, subsystem string) (string, error) { mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) if err != nil { return "", err } return filepath.Join(mountpoint, slice, unit), nil } func (m *LegacyManager) Freeze(state cgroups.FreezerState) error { err := m.doFreeze(state) if err == nil { m.cgroups.Resources.Freezer = state } return err } // doFreeze is the same as Freeze but without // changing the m.cgroups.Resources.Frozen field. func (m *LegacyManager) doFreeze(state cgroups.FreezerState) error { path, ok := m.paths["freezer"] if !ok { return errSubsystemDoesNotExist } freezer := &fs.FreezerGroup{} resources := &cgroups.Resources{Freezer: state} return freezer.Set(path, resources) } func (m *LegacyManager) GetPids() ([]int, error) { path, ok := m.paths["devices"] if !ok { return nil, errSubsystemDoesNotExist } return cgroups.GetPids(path) } func (m *LegacyManager) GetAllPids() ([]int, error) { path, ok := m.paths["devices"] if !ok { return nil, errSubsystemDoesNotExist } return cgroups.GetAllPids(path) } func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() stats := cgroups.NewStats() for _, sys := range legacySubsystems { path := m.paths[sys.Name()] if path == "" { continue } if err := sys.GetStats(path, stats); err != nil { return nil, err } } return stats, nil } func (m *LegacyManager) Set(r *cgroups.Resources) error { if r == nil { return nil } if r.Unified != nil { return cgroups.ErrV1NoUnified } // Use a copy since CpuQuota in r may be modified. 
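// (addCPUQuota may round CpuQuota up, and mutating a copy keeps the
// caller's Resources intact while the same rounded value is later written
// to cgroupfs.)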
rCopy := *r r = &rCopy properties, err := genV1ResourcesProperties(r, m.dbus) if err != nil { return err } unitName := getUnitName(m.cgroups) needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) if err != nil { return err } if needsFreeze { if err := m.doFreeze(cgroups.Frozen); err != nil { // If freezer cgroup isn't supported, we just warn about it. logrus.Infof("freeze container before SetUnitProperties failed: %v", err) // skip update the cgroup while frozen failed. #3803 if !errors.Is(err, errSubsystemDoesNotExist) { if needsThaw { if thawErr := m.doFreeze(cgroups.Thawed); thawErr != nil { logrus.Infof("thaw container after doFreeze failed: %v", thawErr) } } return err } } } setErr := setUnitProperties(m.dbus, unitName, properties...) if needsThaw { if err := m.doFreeze(cgroups.Thawed); err != nil { logrus.Infof("thaw container after SetUnitProperties failed: %v", err) } } if setErr != nil { return setErr } for _, sys := range legacySubsystems { // Get the subsystem path, but don't error out for not found cgroups. path, ok := m.paths[sys.Name()] if !ok { continue } if err := sys.Set(path, r); err != nil { return err } } return nil } func (m *LegacyManager) GetPaths() map[string]string { m.mu.Lock() defer m.mu.Unlock() return m.paths } func (m *LegacyManager) GetCgroups() (*cgroups.Cgroup, error) { return m.cgroups, nil } func (m *LegacyManager) GetFreezerState() (cgroups.FreezerState, error) { path, ok := m.paths["freezer"] if !ok { return cgroups.Undefined, nil } freezer := &fs.FreezerGroup{} return freezer.GetState(path) } func (m *LegacyManager) Exists() bool { return cgroups.PathExists(m.Path("devices")) } func (m *LegacyManager) OOMKillCount() (uint64, error) { return fs.OOMKillCount(m.Path("memory")) } cgroups-0.0.4/systemd/v2.go000066400000000000000000000343151503527177300155630ustar00rootroot00000000000000package systemd import ( "bufio" "errors" "fmt" "math" "os" "path/filepath" "strconv" "strings" "sync" systemdDbus "github.com/coreos/go-systemd/v22/dbus" securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs2" ) const ( cpuIdleSupportedVersion = 252 ) type UnifiedManager struct { mu sync.Mutex cgroups *cgroups.Cgroup // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" path string dbus *dbusConnManager fsMgr cgroups.Manager } func NewUnifiedManager(config *cgroups.Cgroup, path string) (*UnifiedManager, error) { m := &UnifiedManager{ cgroups: config, path: path, dbus: newDbusConnManager(config.Rootless), } if err := m.initPath(); err != nil { return nil, err } fsMgr, err := fs2.NewManager(config, m.path) if err != nil { return nil, err } m.fsMgr = fsMgr return m, nil } func shouldSetCPUIdle(cm *dbusConnManager, v string) bool { // The only valid values for cpu.idle are 0 and 1. As it is // not possible to directly set cpu.idle to 0 via systemd, // ignore 0. Ignore other values as we'll error out later // in Set() while calling fsMgr.Set(). return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion } // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified // key/value map (where key is cgroupfs file name) to systemd unit properties. // This is on a best-effort basis, so the properties that are not known // (to this function and/or systemd) are ignored (but logged with "debug" // log level). 
// // For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt // // For the list of systemd unit properties, see systemd.resource-control(5). func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { var err error for k, v := range res { if strings.Contains(k, "/") { return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) } if strings.IndexByte(k, '.') <= 0 { return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) } // Kernel is quite forgiving to extra whitespace // around the value, and so should we. v = strings.TrimSpace(v) // Please keep cases in alphabetical order. switch k { case "cpu.idle": if shouldSetCPUIdle(cm, v) { // Setting CPUWeight to 0 tells systemd // to set cpu.idle to 1. props = append(props, newProp("CPUWeight", uint64(0))) } case "cpu.max": // value: quota [period] quota := int64(0) // 0 means "unlimited" for addCPUQuota, if period is set period := defCPUQuotaPeriod sv := strings.Fields(v) if len(sv) < 1 || len(sv) > 2 { return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) } // quota if sv[0] != "max" { quota, err = strconv.ParseInt(sv[0], 10, 64) if err != nil { return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) } } // period if len(sv) == 2 { period, err = strconv.ParseUint(sv[1], 10, 64) if err != nil { return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) } } addCPUQuota(cm, &props, &quota, period) case "cpu.weight": if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) { // Do not add duplicate CPUWeight property // (see case "cpu.idle" above). logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight") continue } num, err := strconv.ParseUint(v, 10, 64) if err != nil { return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) } props = append(props, newProp("CPUWeight", num)) case "cpuset.cpus", "cpuset.mems": bits, err := RangeToBits(v) if err != nil { return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) } m := map[string]string{ "cpuset.cpus": "AllowedCPUs", "cpuset.mems": "AllowedMemoryNodes", } // systemd only supports these properties since v244 sdVer := systemdVersion(cm) if sdVer >= 244 { props = append(props, newProp(m[k], bits)) } else { logrus.Debugf("systemd v%d is too old to support %s"+ " (setting will still be applied to cgroupfs)", sdVer, m[k]) } case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": num := uint64(math.MaxUint64) if v != "max" { num, err = strconv.ParseUint(v, 10, 64) if err != nil { return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) } } m := map[string]string{ "memory.high": "MemoryHigh", "memory.low": "MemoryLow", "memory.min": "MemoryMin", "memory.max": "MemoryMax", "memory.swap.max": "MemorySwapMax", } props = append(props, newProp(m[k], num)) case "pids.max": num := uint64(math.MaxUint64) if v != "max" { var err error num, err = strconv.ParseUint(v, 10, 64) if err != nil { return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) } } props = append(props, newProp("TasksMax", num)) case "memory.oom.group": // Setting this to 1 is roughly equivalent to OOMPolicy=kill // (as per systemd.service(5) and // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), // but it's not clear what to do if it is unset or set // to 0 in runc
update, as there are two other possible // values for OOMPolicy (continue/stop). fallthrough default: // Ignore the unknown resource here -- will still be // applied in Set which calls fs2.Set. logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) } } return props, nil } func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { // We need this check before setting systemd properties, otherwise // the container is OOM-killed and the systemd unit is removed // before we get to fsMgr.Set(). if err := fs2.CheckMemoryUsage(dirPath, r); err != nil { return nil, err } var properties []systemdDbus.Property // NOTE: This is of questionable correctness because we insert our own // devices eBPF program later. Two programs with identical rules // aren't the end of the world, but it is a bit concerning. However // it's unclear if systemd removes all eBPF programs attached when // doing SetUnitProperties... deviceProperties, err := generateDeviceProperties(r, cm) if err != nil { return nil, err } properties = append(properties, deviceProperties...) if r.Memory != 0 { properties = append(properties, newProp("MemoryMax", uint64(r.Memory))) } if r.MemoryReservation != 0 { properties = append(properties, newProp("MemoryLow", uint64(r.MemoryReservation))) } swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) if err != nil { return nil, err } if swap != 0 { properties = append(properties, newProp("MemorySwapMax", uint64(swap))) } idleSet := false // The logic here is the same as in shouldSetCPUIdle. if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion { properties = append(properties, newProp("CPUWeight", uint64(0))) idleSet = true } if r.CpuWeight != 0 { if idleSet { // Ignore CpuWeight if CPUIdle is already set. logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight") } else { properties = append(properties, newProp("CPUWeight", r.CpuWeight)) } } addCPUQuota(cm, &properties, &r.CpuQuota, r.CpuPeriod) if r.PidsLimit > 0 || r.PidsLimit == -1 { properties = append(properties, newProp("TasksMax", uint64(r.PidsLimit))) } err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) if err != nil { return nil, err } // ignore r.KernelMemory // convert Resources.Unified map to systemd properties if r.Unified != nil { unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) if err != nil { return nil, err } properties = append(properties, unifiedProps...) } return properties, nil } func (m *UnifiedManager) Apply(pid int) error { var ( c = m.cgroups unitName = getUnitName(c) properties []systemdDbus.Property ) slice := "system.slice" if m.cgroups.Rootless { slice = "user.slice" } if c.Parent != "" { slice = c.Parent } properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) if strings.HasSuffix(unitName, ".slice") { // If we create a slice, the parent is defined via a Wants=. properties = append(properties, systemdDbus.PropWants(slice)) } else { // Otherwise it's a scope, which we put into a Slice=. properties = append(properties, systemdDbus.PropSlice(slice)) // Assume scopes always support delegation (supported since systemd v218). properties = append(properties, newProp("Delegate", true)) } // only add pid if its valid, -1 is used w/ general slice creation. 
if pid != -1 { properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) } // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, newProp("MemoryAccounting", true), newProp("CPUAccounting", true), newProp("IOAccounting", true), newProp("TasksAccounting", true), ) // Assume DefaultDependencies= will always work (the check for it was previously broken.) properties = append(properties, newProp("DefaultDependencies", false)) properties = append(properties, c.SystemdProps...) if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) } if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { return err } if c.OwnerUID != nil { // The directory itself must be chowned. err := os.Chown(m.path, *c.OwnerUID, -1) if err != nil { return err } filesToChown, err := cgroupFilesToChown() if err != nil { return err } for _, v := range filesToChown { err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) // Some files might not be present. if err != nil && !errors.Is(err, os.ErrNotExist) { return err } } } return nil } // The kernel exposes a list of files that should be chowned to the delegate // uid in /sys/kernel/cgroup/delegate. If the file is not present // (Linux < 4.15), use the initial values mentioned in cgroups(7). func cgroupFilesToChown() ([]string, error) { const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" f, err := os.Open(cgroupDelegateFile) if err != nil { return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil } defer f.Close() filesToChown := []string{} scanner := bufio.NewScanner(f) for scanner.Scan() { filesToChown = append(filesToChown, scanner.Text()) } if err := scanner.Err(); err != nil { return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) } return filesToChown, nil } func (m *UnifiedManager) Destroy() error { m.mu.Lock() defer m.mu.Unlock() unitName := getUnitName(m.cgroups) if err := stopUnit(m.dbus, unitName); err != nil { return err } // systemd 239 do not remove sub-cgroups. err := m.fsMgr.Destroy() // fsMgr.Destroy has handled ErrNotExist if err != nil { return err } return nil } func (m *UnifiedManager) Path(_ string) string { return m.path } // getSliceFull value is used in initPath. // The value is incompatible with systemdDbus.PropSlice. func (m *UnifiedManager) getSliceFull() (string, error) { c := m.cgroups slice := "system.slice" if c.Rootless { slice = "user.slice" } if c.Parent != "" { var err error slice, err = ExpandSlice(c.Parent) if err != nil { return "", err } } if c.Rootless { // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". managerCG, err := getManagerProperty(m.dbus, "ControlGroup") if err != nil { return "", err } slice = filepath.Join(managerCG, slice) } // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. 
return slice, nil } func (m *UnifiedManager) initPath() error { if m.path != "" { return nil } sliceFull, err := m.getSliceFull() if err != nil { return err } c := m.cgroups path := filepath.Join(sliceFull, getUnitName(c)) path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) if err != nil { return err } // an example of the final path in rootless: // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" m.path = path return nil } func (m *UnifiedManager) Freeze(state cgroups.FreezerState) error { return m.fsMgr.Freeze(state) } func (m *UnifiedManager) GetPids() ([]int, error) { return cgroups.GetPids(m.path) } func (m *UnifiedManager) GetAllPids() ([]int, error) { return cgroups.GetAllPids(m.path) } func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { return m.fsMgr.GetStats() } func (m *UnifiedManager) Set(r *cgroups.Resources) error { if r == nil { return nil } // Use a copy since CpuQuota in r may be modified. rCopy := *r r = &rCopy properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus) if err != nil { return err } if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil { return fmt.Errorf("unable to set unit properties: %w", err) } return m.fsMgr.Set(r) } func (m *UnifiedManager) GetPaths() map[string]string { paths := make(map[string]string, 1) paths[""] = m.path return paths } func (m *UnifiedManager) GetCgroups() (*cgroups.Cgroup, error) { return m.cgroups, nil } func (m *UnifiedManager) GetFreezerState() (cgroups.FreezerState, error) { return m.fsMgr.GetFreezerState() } func (m *UnifiedManager) Exists() bool { return cgroups.PathExists(m.path) } func (m *UnifiedManager) OOMKillCount() (uint64, error) { return m.fsMgr.OOMKillCount() } cgroups-0.0.4/utils.go000066400000000000000000000312741503527177300147050ustar00rootroot00000000000000package cgroups import ( "bufio" "errors" "fmt" "io" "math" "os" "path/filepath" "strconv" "strings" "sync" "time" "github.com/moby/sys/userns" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) const ( CgroupProcesses = "cgroup.procs" unifiedMountpoint = "/sys/fs/cgroup" hybridMountpoint = "/sys/fs/cgroup/unified" ) var ( isUnifiedOnce sync.Once isUnified bool isHybridOnce sync.Once isHybrid bool ) // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. func IsCgroup2UnifiedMode() bool { isUnifiedOnce.Do(func() { var st unix.Statfs_t err := unix.Statfs(unifiedMountpoint, &st) if err != nil { level := logrus.WarnLevel if os.IsNotExist(err) && userns.RunningInUserNS() { // For rootless containers, sweep it under the rug. level = logrus.DebugLevel } logrus.StandardLogger().Logf(level, "statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err) } isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC }) return isUnified } // IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. func IsCgroup2HybridMode() bool { isHybridOnce.Do(func() { var st unix.Statfs_t err := unix.Statfs(hybridMountpoint, &st) if err != nil { isHybrid = false if !os.IsNotExist(err) { // Report unexpected errors. logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint) } return } isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC }) return isHybrid } type Mount struct { Mountpoint string Root string Subsystems []string } // GetCgroupMounts returns the mounts for the cgroup subsystems. // all indicates whether to return just the first instance or all the mounts. 
// This function should not be used from cgroupv2 code, as in this case // all the controllers are available under the constant unifiedMountpoint. func GetCgroupMounts(all bool) ([]Mount, error) { if IsCgroup2UnifiedMode() { // TODO: remove cgroupv2 case once all external users are converted availableControllers, err := GetAllSubsystems() if err != nil { return nil, err } m := Mount{ Mountpoint: unifiedMountpoint, Root: unifiedMountpoint, Subsystems: availableControllers, } return []Mount{m}, nil } return getCgroupMountsV1(all) } // GetAllSubsystems returns all the cgroup subsystems supported by the kernel func GetAllSubsystems() ([]string, error) { // /proc/cgroups is meaningless for v2 // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features if IsCgroup2UnifiedMode() { // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. // - devices: implemented in kernel 4.15 // - freezer: implemented in kernel 5.2 // We assume these are always available, as it is hard to detect availability. pseudo := []string{"devices", "freezer"} data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") if err != nil { return nil, err } subsystems := append(pseudo, strings.Fields(data)...) return subsystems, nil } f, err := os.Open("/proc/cgroups") if err != nil { return nil, err } defer f.Close() subsystems := []string{} s := bufio.NewScanner(f) for s.Scan() { text := s.Text() if text[0] != '#' { parts := strings.Fields(text) if len(parts) >= 4 && parts[3] != "0" { subsystems = append(subsystems, parts[0]) } } } if err := s.Err(); err != nil { return nil, err } return subsystems, nil } func readProcsFile(dir string) (out []int, _ error) { file := CgroupProcesses retry := true again: f, err := OpenFile(dir, file, os.O_RDONLY) if err != nil { return nil, err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { if t := s.Text(); t != "" { pid, err := strconv.Atoi(t) if err != nil { return nil, err } out = append(out, pid) } } if errors.Is(s.Err(), unix.ENOTSUP) && retry { // For a threaded cgroup, read returns ENOTSUP, and we should // read from cgroup.threads instead. file = "cgroup.threads" retry = false goto again } return out, s.Err() } // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup // or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. // // "cpu": "/user.slice/user-1000.slice" // "pids": "/user.slice/user-1000.slice" // // etc. // // Note that for cgroup v2 unified hierarchy, there are no per-controller // cgroup paths, so the resulting map will have a single element where the key // is empty string ("") and the value is the cgroup path the is in. func ParseCgroupFile(path string) (map[string]string, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() return parseCgroupFromReader(f) } // helper function for ParseCgroupFile to make testing easier func parseCgroupFromReader(r io.Reader) (map[string]string, error) { s := bufio.NewScanner(r) cgroups := make(map[string]string) for s.Scan() { text := s.Text() // from cgroups(7): // /proc/[pid]/cgroup // ... // For each cgroup hierarchy ... 
there is one entry // containing three colon-separated fields of the form: // hierarchy-ID:subsystem-list:cgroup-path parts := strings.SplitN(text, ":", 3) if len(parts) < 3 { return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) } for _, subs := range strings.Split(parts[1], ",") { cgroups[subs] = parts[2] } } if err := s.Err(); err != nil { return nil, err } return cgroups, nil } func PathExists(path string) bool { if _, err := os.Stat(path); err != nil { return false } return true } // rmdir tries to remove a directory, optionally retrying on EBUSY. func rmdir(path string, retry bool) error { delay := time.Millisecond tries := 10 again: err := unix.Rmdir(path) switch err { case nil, unix.ENOENT: return nil case unix.EINTR: goto again case unix.EBUSY: if retry && tries > 0 { time.Sleep(delay) delay *= 2 tries-- goto again } } return &os.PathError{Op: "rmdir", Path: path, Err: err} } // RemovePath aims to remove the cgroup path. It does so recursively, // by removing any subdirectories (sub-cgroups) first. func RemovePath(path string) error { // Try the fast path first; don't retry on EBUSY yet. if err := rmdir(path, false); err == nil { return nil } // There are many reasons why rmdir can fail, including: // 1. the cgroup has existing sub-cgroups; // 2. the cgroup (still) has some processes (that are about to vanish); // 3. lack of permission (one example is read-only /sys/fs/cgroup mount, // in which case rmdir returns EROFS even for a non-existent path, // see issue 4518). // // Using os.ReadDir here kills two birds with one stone: check if // the directory exists (handling scenario 3 above), and use // directory contents to remove sub-cgroups (handling scenario 1). infos, err := os.ReadDir(path) if err != nil { if os.IsNotExist(err) { return nil } return err } // Let's remove sub-cgroups, if any. for _, info := range infos { if info.IsDir() { if err = RemovePath(filepath.Join(path, info.Name())); err != nil { return err } } } // Finally, try rmdir again, this time with retries on EBUSY, // which may help with scenario 2 above. return rmdir(path, true) } // RemovePaths iterates over the provided paths removing them. func RemovePaths(paths map[string]string) (err error) { for s, p := range paths { if err := RemovePath(p); err == nil { delete(paths, s) } } if len(paths) == 0 { clear(paths) return nil } return fmt.Errorf("Failed to remove paths: %v", paths) } var ( hugePageSizes []string initHPSOnce sync.Once ) func HugePageSizes() []string { initHPSOnce.Do(func() { dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) if err != nil { return } files, err := dir.Readdirnames(0) dir.Close() if err != nil { return } hugePageSizes, err = getHugePageSizeFromFilenames(files) if err != nil { logrus.Warn("HugePageSizes: ", err) } }) return hugePageSizes } func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { pageSizes := make([]string, 0, len(fileNames)) var warn error for _, file := range fileNames { // example: hugepages-1048576kB val, ok := strings.CutPrefix(file, "hugepages-") if !ok { // Unexpected file name: no prefix found, ignore it. continue } // The suffix is always "kB" (as of Linux 5.13). If we find // something else, produce an error but keep going. eLen := len(val) - 2 val = strings.TrimSuffix(val, "kB") if len(val) != eLen { // Highly unlikely. if warn == nil { warn = errors.New(file + `: invalid suffix (expected "kB")`) } continue } size, err := strconv.Atoi(val) if err != nil { // Highly unlikely.
if warn == nil { warn = fmt.Errorf("%s: %w", file, err) } continue } // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 // but in our case the size is in KB already. if size >= (1 << 20) { val = strconv.Itoa(size>>20) + "GB" } else if size >= (1 << 10) { val = strconv.Itoa(size>>10) + "MB" } else { val += "KB" } pageSizes = append(pageSizes, val) } return pageSizes, warn } // GetPids returns all pids that were added to the cgroup at the given path. func GetPids(dir string) ([]int, error) { return readProcsFile(dir) } // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file. func WriteCgroupProc(dir string, pid int) error { // Normally dir should not be empty; one case when it is empty is when the // cgroup subsystem is not mounted, and we want to fail here. if dir == "" { return fmt.Errorf("no such directory for %s", CgroupProcesses) } // Don't attach any pid to the cgroup if -1 is specified as a pid. if pid == -1 { return nil } file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) if err != nil { return fmt.Errorf("failed to write %v: %w", pid, err) } defer file.Close() for range 5 { _, err = file.WriteString(strconv.Itoa(pid)) if err == nil { return nil } // EINVAL might mean that the task being added to cgroup.procs is in state // TASK_NEW. We should attempt the write again. if errors.Is(err, unix.EINVAL) { time.Sleep(30 * time.Millisecond) continue } return fmt.Errorf("failed to write %v: %w", pid, err) } return err } // ConvertCPUSharesToCgroupV2Value converts CPU shares, used by cgroup v1, // to CPU weight, used by cgroup v2. // // Cgroup v1 CPU shares has a range of [2^1...2^18], i.e. [2...262144], // and the default value is 1024. // // Cgroup v2 CPU weight has a range of [10^0...10^4], i.e. [1...10000], // and the default value is 100. func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { // The value of 0 means "unset". if cpuShares == 0 { return 0 } if cpuShares <= 2 { return 1 } if cpuShares >= 262144 { return 10000 } l := math.Log2(float64(cpuShares)) // Quadratic function which fits min, max, and default. exponent := (l*l+125*l)/612.0 - 7.0/34.0 return uint64(math.Ceil(math.Pow(10, exponent))) } // ConvertMemorySwapToCgroupV2Value converts the MemorySwap value from the OCI spec // for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap // is defined as memory+swap combined, while in cgroup v2 swap is a separate value, // so we need to subtract memory from it where it makes sense. func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { switch { case memory == -1 && memorySwap == 0: // For compatibility with the cgroup v1 controller, set swap to unlimited in // case the memory is set to unlimited and the swap is not explicitly set, // treating the request as "set both memory and swap to unlimited". return -1, nil case memorySwap == -1, memorySwap == 0: // Treat -1 ("max") and 0 ("unset") swap as is. return memorySwap, nil case memory == -1: // Unlimited memory, so treat swap as is. return memorySwap, nil case memory == 0: // Unset or unknown memory, can't calculate swap. return 0, errors.New("unable to set swap limit without memory limit") case memory < 0: // Does not make sense to subtract a negative value. return 0, fmt.Errorf("invalid memory value: %d", memory) case memorySwap < memory: // Sanity check.
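// (For example, memorySwap=300 with memory=400 is rejected below, since the combined memory+swap limit can never be smaller than the memory limit alone.)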
return 0, errors.New("memory+swap limit should be >= memory limit") } return memorySwap - memory, nil } // ConvertBlkIOToIOWeightValue converts the cgroup v1 BlkIOWeight value to the // cgroup v2 IOWeight value, since the OCI spec is designed for cgroup v1. // The formula y = 1 + (x - 10) * 9999 / 990 maps the range [10-1000] // linearly onto [1-10000]. func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { if blkIoWeight == 0 { return 0 } return 1 + (uint64(blkIoWeight)-10)*9999/990 } cgroups-0.0.4/utils_test.go000066400000000000000000001076271503527177300157500ustar00rootroot00000000000000package cgroups import ( "bytes" "errors" "path/filepath" "reflect" "slices" "strings" "testing" "github.com/moby/sys/mountinfo" "golang.org/x/sys/unix" ) const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw 16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel 17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755 18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw 19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw 20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel 21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755 23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755 24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd 25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw 26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children 27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children 28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children 29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children 30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children 31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children 32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children 33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children 34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children 35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered 36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct 37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel 38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel 39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel 40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw 41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw 42 35 0:34 /
/var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw 43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw 45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered 46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered 47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered 48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered 121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000 124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw 165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered 167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered 171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered 175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered 179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered 183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered 187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered 191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered 195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered 199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered 203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime 
shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered 207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered 211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered 215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered 219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered 223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered 227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered 231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered 235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered 239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered 243 35 253:34 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered 247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered 31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1` const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none 
rw,si=c0bd3d3,dio,dirperm1 116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw 117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755 118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw 120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755 121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd 122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices 123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer 124 120 0:22 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory 125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio 126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio 127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children 128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct 129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event 130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered 131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered 132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered 133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered 134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k 135 117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw 136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000 84 115 0:40 / /tmp rw,relatime - tmpfs none rw` const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec 
shared:16 - tmpfs tmpfs ro,mode=755 122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd 129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd 128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd 127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd 126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd 140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio 144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio 143 123 0:32 / /bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio 142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio 141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio 145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio 149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio 148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio 147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio 146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio 150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct 154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct 153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct 152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct 151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct 155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset 159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset 158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset 157 122 0:35 / 
/bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset 156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset 160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices 164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices 163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices 162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices 161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices 165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory 169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory 168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory 167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory 166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory 170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer 174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer 173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer 172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer 171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer 175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids 179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids 178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids 177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids 176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids 180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event 184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event 183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event 182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event 181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event` const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel 19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw 20 64 0:6 / /dev rw,nosuid shared:2 - 
devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755 21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw 22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel 23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755 25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755 26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw 27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel 28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw 29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct 30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory 31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio 32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio 33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event 34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb 35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer 36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset 37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices 38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids 61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw 64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered 39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw 40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel 41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs hugetlbfs rw,seclabel ` func TestGetCgroupMounts(t *testing.T) { type testData struct { mountInfo string root string // all is the total number of records expected with all=true, // or 0 for no extra records expected (most cases). 
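// (Illustrative: in the bedrockMountinfo case each of the 10 cgroup v1 hierarchies is mounted in 5 places, once under /sys/fs/cgroup and four times under /bedrock/strata/*, hence all == 50.)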
all int subsystems map[string]bool } testTable := []testData{ { mountInfo: fedoraMountinfo, root: "/", subsystems: map[string]bool{ "name=systemd": false, "cpuset": false, "cpu": false, "cpuacct": false, "memory": false, "devices": false, "freezer": false, "net_cls": false, "blkio": false, "perf_event": false, "hugetlb": false, }, }, { mountInfo: systemdMountinfo, root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope", subsystems: map[string]bool{ "name=systemd": false, "cpuset": false, "cpu": false, "cpuacct": false, "memory": false, "devices": false, "freezer": false, "net_cls": false, "net_prio": false, "blkio": false, "perf_event": false, }, }, { mountInfo: bedrockMountinfo, root: "/", all: 50, subsystems: map[string]bool{ "name=systemd": false, "cpuset": false, "cpu": false, "cpuacct": false, "memory": false, "devices": false, "freezer": false, "net_cls": false, "net_prio": false, "blkio": false, "perf_event": false, "pids": false, }, }, } for _, td := range testTable { mi, err := mountinfo.GetMountsFromReader( bytes.NewBufferString(td.mountInfo), mountinfo.FSTypeFilter("cgroup"), ) if err != nil { t.Fatal(err) } cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false) if err != nil { t.Fatal(err) } cgMap := make(map[string]Mount) for _, m := range cgMounts { for _, ss := range m.Subsystems { cgMap[ss] = m } } for ss := range td.subsystems { ss = strings.TrimPrefix(ss, CgroupNamePrefix) m, ok := cgMap[ss] if !ok { t.Fatalf("%s not found", ss) } if m.Root != td.root { t.Fatalf("unexpected root for %s: %s", ss, m.Root) } if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) { t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint) } if !slices.Contains(m.Subsystems, ss) { t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems) } } // Test the all=true case. // Reset the test input. for k := range td.subsystems { td.subsystems[k] = false } cgMountsAll, err := getCgroupMountsHelper(td.subsystems, mi, true) if err != nil { t.Fatal(err) } if td.all == 0 { // Results with and without "all" should be the same. if len(cgMounts) != len(cgMountsAll) || !reflect.DeepEqual(cgMounts, cgMountsAll) { t.Errorf("expected same results, got (all=false) %v, (all=true) %v", cgMounts, cgMountsAll) } } else { // Make sure we got all records. 
if len(cgMountsAll) != td.all { t.Errorf("expected %d records, got %d (%+v)", td.all, len(cgMountsAll), cgMountsAll) } } } } func BenchmarkGetCgroupMounts(b *testing.B) { subsystems := map[string]bool{ "cpuset": false, "cpu": false, "cpuacct": false, "memory": false, "devices": false, "freezer": false, "net_cls": false, "blkio": false, "perf_event": false, "hugetlb": false, } mi, err := mountinfo.GetMountsFromReader( bytes.NewBufferString(fedoraMountinfo), mountinfo.FSTypeFilter("cgroup"), ) if err != nil { b.Fatal(err) } b.ResetTimer() for i := 0; i < b.N; i++ { if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil { b.Fatal(err) } } } func TestParseCgroupString(t *testing.T) { testCases := []struct { input string expectedError error expectedOutput map[string]string }{ { // Taken from a CoreOS instance running systemd 225 with CPU/Mem // accounting enabled in systemd input: `9:blkio:/ 8:freezer:/ 7:perf_event:/ 6:devices:/system.slice/system-sshd.slice 5:cpuset:/ 4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service 3:net_cls,net_prio:/ 2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service 1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`, expectedOutput: map[string]string{ "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", "blkio": "/", "freezer": "/", "perf_event": "/", "devices": "/system.slice/system-sshd.slice", "cpuset": "/", "cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", "cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", "net_cls": "/", "net_prio": "/", "memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", }, }, { input: `malformed input`, expectedError: errors.New(`invalid cgroup entry: must contain at least two colons: malformed input`), }, } for ndx, testCase := range testCases { out, err := parseCgroupFromReader(strings.NewReader(testCase.input)) if err != nil { if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() { t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err) } } else { if !reflect.DeepEqual(testCase.expectedOutput, out) { t.Errorf("%v: expected output %v, got output %v", ndx, testCase.expectedOutput, out) } } } } func TestIgnoreCgroup2Mount(t *testing.T) { subsystems := map[string]bool{ "cpuset": false, "cpu": false, "cpuacct": false, "memory": false, "devices": false, "freezer": false, "net_cls": false, "blkio": false, "perf_event": false, "pids": false, "name=systemd": false, } mi, err := mountinfo.GetMountsFromReader( bytes.NewBufferString(cgroup2Mountinfo), mountinfo.FSTypeFilter("cgroup"), ) if err != nil { t.Fatal(err) } cgMounts, err := getCgroupMountsHelper(subsystems, mi, false) if err != nil { t.Fatal(err) } for _, m := range cgMounts { if m.Mountpoint == "/sys/fs/cgroup/systemd" { t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it") } } } func TestFindCgroupMountpointAndRoot(t *testing.T) { fakeMountInfo := `35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices 35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices` testCases := []struct { cgroupPath string output string }{ {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"}, {cgroupPath:
"", output: "/foo"}, } mi, err := mountinfo.GetMountsFromReader( bytes.NewBufferString(fakeMountInfo), mountinfo.FSTypeFilter("cgroup"), ) if err != nil { t.Fatal(err) } for _, c := range testCases { mountpoint, _, _ := findCgroupMountpointAndRootFromMI(mi, c.cgroupPath, "devices") if mountpoint != c.output { t.Errorf("expected %s, got %s", c.output, mountpoint) } } } func BenchmarkGetHugePageSizeImpl(b *testing.B) { var ( input = []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"} output []string err error ) for i := 0; i < b.N; i++ { output, err = getHugePageSizeFromFilenames(input) } if err != nil || len(output) != len(input) { b.Fatal("unexpected results") } } func TestGetHugePageSizeImpl(t *testing.T) { testCases := []struct { doc string input []string output []string isErr bool }{ { doc: "normal input", input: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, output: []string{"1GB", "2MB", "32MB", "64KB"}, }, { doc: "empty input", input: []string{}, output: []string{}, }, { doc: "not a number", input: []string{"hugepages-akB"}, isErr: true, }, { doc: "no prefix (silently skipped)", input: []string{"1024kB"}, }, { doc: "invalid prefix (silently skipped)", input: []string{"whatever-1024kB"}, }, { doc: "invalid suffix", input: []string{"hugepages-1024gB"}, isErr: true, }, { doc: "no suffix", input: []string{"hugepages-1024"}, isErr: true, }, { doc: "mixed valid and invalid entries", input: []string{"hugepages-4194304kB", "hugepages-2048kB", "hugepages-akB", "hugepages-64kB"}, output: []string{"4GB", "2MB", "64KB"}, isErr: true, }, { doc: "more mixed valid and invalid entries", input: []string{"hugepages-2048kB", "hugepages-kB", "hugepages-64kB"}, output: []string{"2MB", "64KB"}, isErr: true, }, } for _, c := range testCases { c := c t.Run(c.doc, func(t *testing.T) { output, err := getHugePageSizeFromFilenames(c.input) t.Log("input:", c.input, "; output:", output, "; err:", err) if err != nil { if !c.isErr { t.Errorf("input %v, expected nil, got error: %v", c.input, err) } // no more checks return } if c.isErr { t.Errorf("input %v, expected error, got error: nil, output: %v", c.input, output) } // check output if len(output) != len(c.output) || (len(output) > 0 && !reflect.DeepEqual(output, c.output)) { t.Errorf("input %v, expected %v, got %v", c.input, c.output, output) } }) } } func TestConvertCPUSharesToCgroupV2Value(t *testing.T) { const ( sharesMin = 2 sharesMax = 262144 sharesDef = 1024 weightMin = 1 weightMax = 10000 weightDef = 100 unset = 0 ) cases := map[uint64]uint64{ unset: unset, sharesMin - 1: weightMin, // Below the minimum (out of range). sharesMin: weightMin, // Minimum. sharesMin + 1: weightMin + 1, // Just above the minimum. sharesDef: weightDef, // Default. sharesMax - 1: weightMax, // Just below the maximum. sharesMax: weightMax, // Maximum. sharesMax + 1: weightMax, // Above the maximum (out of range). 
} for shares, want := range cases { got := ConvertCPUSharesToCgroupV2Value(shares) if got != want { t.Errorf("ConvertCPUSharesToCgroupV2Value(%d): got %d, want %d", shares, got, want) } } } func TestConvertMemorySwapToCgroupV2Value(t *testing.T) { cases := []struct { descr string memswap, memory int64 expected int64 expErr bool }{ { descr: "all unset", memswap: 0, memory: 0, expected: 0, }, { descr: "unlimited memory+swap, unset memory", memswap: -1, memory: 0, expected: -1, }, { descr: "unlimited memory", memswap: 300, memory: -1, expected: 300, }, { descr: "all unlimited", memswap: -1, memory: -1, expected: -1, }, { descr: "negative memory+swap", memswap: -2, memory: 0, expErr: true, }, { descr: "unlimited memory+swap, set memory", memswap: -1, memory: 1000, expected: -1, }, { descr: "memory+swap == memory", memswap: 1000, memory: 1000, expected: 0, }, { descr: "memory+swap > memory", memswap: 500, memory: 200, expected: 300, }, { descr: "memory+swap < memory", memswap: 300, memory: 400, expErr: true, }, { descr: "unset memory", memswap: 300, memory: 0, expErr: true, }, { descr: "negative memory", memswap: 300, memory: -300, expErr: true, }, } for _, c := range cases { c := c t.Run(c.descr, func(t *testing.T) { swap, err := ConvertMemorySwapToCgroupV2Value(c.memswap, c.memory) if c.expErr { if err == nil { t.Errorf("memswap: %d, memory %d, expected error, got %d, nil", c.memswap, c.memory, swap) } // No more checks. return } if err != nil { t.Errorf("memswap: %d, memory %d, expected success, got error %s", c.memswap, c.memory, err) } if swap != c.expected { t.Errorf("memswap: %d, memory %d, expected %d, got %d", c.memswap, c.memory, c.expected, swap) } }) } } func TestConvertBlkIOToIOWeightValue(t *testing.T) { cases := map[uint16]uint64{ 0: 0, 10: 1, 1000: 10000, } for i, expected := range cases { got := ConvertBlkIOToIOWeightValue(i) if got != expected { t.Errorf("expected ConvertBlkIOToIOWeightValue(%d) to be %d, got %d", i, expected, got) } } } // TestRemovePathReadOnly tests removing a non-existent dir in a read-only mount point. // For an example of a similar issue, see https://github.com/opencontainers/runc/issues/4518. func TestRemovePathReadOnly(t *testing.T) { dirTo := t.TempDir() err := unix.Mount(t.TempDir(), dirTo, "", unix.MS_BIND, "") if err != nil { t.Skip("no permission to mount") } defer func() { _ = unix.Unmount(dirTo, 0) }() err = unix.Mount("", dirTo, "", unix.MS_REMOUNT|unix.MS_BIND|unix.MS_RDONLY, "") if err != nil { t.Skip("no permission to mount") } nonExistentDir := filepath.Join(dirTo, "non-existent-dir") err = rmdir(nonExistentDir, true) if !errors.Is(err, unix.EROFS) { t.Fatalf("expected the error of removing a non-existent dir %s in a ro mount point with rmdir to be unix.EROFS, but got: %v", nonExistentDir, err) } err = RemovePath(nonExistentDir) if err != nil { t.Fatalf("expected the error of removing a non-existent dir %s in a ro mount point with RemovePath to be nil, but got: %v", nonExistentDir, err) } } cgroups-0.0.4/v1_utils.go000066400000000000000000000152251503527177300153110ustar00rootroot00000000000000package cgroups import ( "errors" "fmt" "os" "path/filepath" "slices" "strings" "sync" "syscall" securejoin "github.com/cyphar/filepath-securejoin" "github.com/moby/sys/mountinfo" "golang.org/x/sys/unix" ) // Code in this source file is specific to cgroup v1, // and must not be used from any cgroup v2 code.
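// On a typical cgroup v1 host each subsystem is a separate mount under /sys/fs/cgroup (e.g. /sys/fs/cgroup/memory), and named hierarchies appear in /proc/self/cgroup with a "name=" prefix (e.g. "name=systemd"); the constants below reflect that layout.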
const ( CgroupNamePrefix = "name=" defaultPrefix = "/sys/fs/cgroup" ) var ( errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") readMountinfoOnce sync.Once readMountinfoErr error cgroupMountinfo []*mountinfo.Info ) type NotFoundError struct { Subsystem string } func (e *NotFoundError) Error() string { return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) } func NewNotFoundError(sub string) error { return &NotFoundError{ Subsystem: sub, } } func IsNotFound(err error) bool { var nfErr *NotFoundError return errors.As(err, &nfErr) } func tryDefaultPath(cgroupPath, subsystem string) string { if !strings.HasPrefix(defaultPrefix, cgroupPath) { return "" } // remove possible prefix subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) // Make sure we're still under defaultPrefix, and resolve // a possible symlink (like cpu -> cpu,cpuacct). path, err := securejoin.SecureJoin(defaultPrefix, subsystem) if err != nil { return "" } // (1) path should be a directory. st, err := os.Lstat(path) if err != nil || !st.IsDir() { return "" } // (2) path should be a mount point. pst, err := os.Lstat(filepath.Dir(path)) if err != nil { return "" } if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { // parent dir has the same dev -- path is not a mount point return "" } // (3) path should have 'cgroup' fs type. fst := unix.Statfs_t{} err = unix.Statfs(path, &fst) if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { return "" } return path } // readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones // with fstype of "cgroup") for the current running process. // // The results are cached (to avoid re-reading mountinfo which is relatively // expensive), so it is assumed that cgroup mounts are not being changed. func readCgroupMountinfo() ([]*mountinfo.Info, error) { readMountinfoOnce.Do(func() { // mountinfo.GetMounts uses /proc/thread-self, so we can use it without // issues. cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( mountinfo.FSTypeFilter("cgroup"), ) }) return cgroupMountinfo, readMountinfoErr } // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } // If subsystem is empty, we look for the cgroupv2 hybrid path. if len(subsystem) == 0 { return hybridMountpoint, nil } // Avoid parsing mountinfo by trying the default path first, if possible. 
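// (For example, subsystem "cpu" typically resolves to /sys/fs/cgroup/cpu, with a possible cpu -> cpu,cpuacct symlink resolved inside tryDefaultPath.)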
if path := tryDefaultPath(cgroupPath, subsystem); path != "" { return path, nil } mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) return mnt, err } func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { if IsCgroup2UnifiedMode() { return "", "", errUnified } mi, err := readCgroupMountinfo() if err != nil { return "", "", err } return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) } func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { for _, mi := range mounts { if strings.HasPrefix(mi.Mountpoint, cgroupPath) { if slices.Contains(strings.Split(mi.VFSOptions, ","), subsystem) { return mi.Mountpoint, mi.Root, nil } } } return "", "", NewNotFoundError(subsystem) } func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { if len(m.Subsystems) == 0 { return "", errors.New("no subsystem for mount") } return getControllerPath(m.Subsystems[0], cgroups) } func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) { res := make([]Mount, 0, len(ss)) numFound := 0 for _, mi := range mounts { m := Mount{ Mountpoint: mi.Mountpoint, Root: mi.Root, } for _, opt := range strings.Split(mi.VFSOptions, ",") { seen, known := ss[opt] if !known || (!all && seen) { continue } ss[opt] = true opt = strings.TrimPrefix(opt, CgroupNamePrefix) m.Subsystems = append(m.Subsystems, opt) numFound++ } if len(m.Subsystems) > 0 || all { res = append(res, m) } if !all && numFound >= len(ss) { break } } return res, nil } func getCgroupMountsV1(all bool) ([]Mount, error) { mi, err := readCgroupMountinfo() if err != nil { return nil, err } // We don't need to use /proc/thread-self here because runc always runs // with every thread in the same cgroup. This lets us avoid having to do // runtime.LockOSThread. allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err } allMap := make(map[string]bool) for s := range allSubsystems { allMap[s] = false } return getCgroupMountsHelper(allMap, mi, all) } // GetOwnCgroup returns the relative path to the cgroup the calling process is running in. func GetOwnCgroup(subsystem string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } // We don't need to use /proc/thread-self here because runc always runs // with every thread in the same cgroup. This lets us avoid having to do // runtime.LockOSThread. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err } return getControllerPath(subsystem, cgroups) } func GetOwnCgroupPath(subsystem string) (string, error) { cgroup, err := GetOwnCgroup(subsystem) if err != nil { return "", err } // If subsystem is empty, we look for the cgroupv2 hybrid path. if len(subsystem) == 0 { return hybridMountpoint, nil } return getCgroupPathHelper(subsystem, cgroup) } func getCgroupPathHelper(subsystem, cgroup string) (string, error) { mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) if err != nil { return "", err } // This is needed for nested containers, because in /proc/self/cgroup we // see paths from the host, which don't exist in the container.
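// (Hypothetical example: with root="/a/b" and cgroup="/a/b/c/d", filepath.Rel below yields "c/d", which is then joined onto mnt.)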
relCgroup, err := filepath.Rel(root, cgroup) if err != nil { return "", err } return filepath.Join(mnt, relCgroup), nil } func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } if p, ok := cgroups[subsystem]; ok { return p, nil } if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { return p, nil } return "", NewNotFoundError(subsystem) }
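// Illustrative usage sketch, assuming a cgroup v1 host: a caller can locate
// its own memory cgroup directory with GetOwnCgroupPath (above) and list the
// pids attached to it with GetPids from utils.go (error handling elided):
//
//	dir, _ := cgroups.GetOwnCgroupPath("memory") // e.g. /sys/fs/cgroup/memory/user.slice
//	pids, _ := cgroups.GetPids(dir)
//	fmt.Println(pids)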